linux: add realtime patch to rpi kernel
[openadk.git] / target/linux/patches/31a8651c138253007c66e1be176cdc839b416842/patch-realtime
1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
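For illustration, the same mechanism can trigger any other command key by appending that key's ASCII code to the cookie. A minimal sketch, using the cookie value from the example above and with <target-ip> standing in for the machine to be debugged:

   # on the target: arm network SysRq with the chosen cookie
   echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
   # from a remote host: cookie bytes plus the ASCII code of the key,
   # here 0x74 ('t') to dump the current task list
   ping -c1 -s57 -p0102030474 <target-ip>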
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
34 +This document gives a short explanation of how to enable, configure and use
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
41 +* Purpose of latency histograms
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
49 +* Potential sources of latencies
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_LATENCY
60 +  CONFIG_PREEMPT_OFF_LATENCY
63 +* Effective latencies
65 +Effective latencies are those that actually occur during the wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases where a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY
82 +  CONFIG_MISSED_TIMER_OFFSETS
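For reference, a .config fragment that switches on all of the histogram types discussed here might look like the sketch below; the exact symbol names are taken from the Kconfig dependencies added elsewhere in this patch and may differ between kernel versions:

   CONFIG_PREEMPT_RT_FULL=y
   CONFIG_INTERRUPT_OFF_HIST=y
   CONFIG_PREEMPT_OFF_HIST=y
   CONFIG_WAKEUP_LATENCY_HIST=y
   CONFIG_MISSED_TIMER_OFFSETS_HIST=y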
85 +* Usage
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
93 +from shell command line level, or add
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies etc. The
104 +histogram data - one per CPU - are available in the files
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
117 +#!/bin/sh
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
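Putting the steps above together, a minimal measurement session for wakeup latencies could look like this sketch (paths as documented above; the 60 second measurement period is arbitrary):

   mount -t debugfs nodev /sys/kernel/debug 2>/dev/null
   cd /sys/kernel/debug/tracing/latency_hist
   echo 1 >enable/wakeup          # start recording wakeup latencies
   sleep 60                       # let the system run under load
   cat wakeup/CPU0                # per-CPU histogram, format described below
   echo 1 >wakeup/reset           # discard the collected data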
132 +* Data format
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive latency count, use, for
139 +example,
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
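Because the data lines are plain two-column text, they are easy to post-process. As an example, the following awk sketch (threshold of 10 microseconds chosen arbitrarily) reports what share of the recorded samples reached or exceeded that latency:

   awk '!/^#/ { total += $2; if ($1 >= 10) slow += $2 }
        END   { printf "%.4f%% of samples >= 10 usecs\n", 100*slow/total }' \
       /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0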
176 +* Wakeup latency of a selected process
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
183 +PIDs are not considered if this variable is set to 0.
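For example, to restrict the recording to a single real-time task (cyclictest is only a stand-in for whatever process is of interest, and a single running instance is assumed):

   echo $(pidof cyclictest) >/sys/kernel/debug/tracing/latency_hist/wakeup/pid
   # ... run the measurement ...
   echo 0 >/sys/kernel/debug/tracing/latency_hist/wakeup/pid   # back to all PIDs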
186 +* Details of the process with the highest wakeup latency so far
188 +Selected data of the process that suffered from the highest wakeup
189 +latency that occurred on a particular CPU are available in the file
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0; in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
205 +When retrospectively searching for the origin of a latency while
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) switched to the
208 +late real-time task. In addition to the victim's data, the data
209 +of the possible culprit are therefore also displayed after the
210 +"<-" symbol.
212 +Finally, the timestamp of when the latency occurred,
213 +in <seconds>.<microseconds> since the most recent system boot,
214 +is provided.
216 +These data are also reset when the wakeup histogram is reset.
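A quick way to inspect these per-CPU records, assuming the wakeup histogram has been enabled as described above, is a loop such as:

   for f in /sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPU*; do
           echo "$f: $(cat $f)"
   done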
217 diff --git a/MAINTAINERS b/MAINTAINERS
218 index 63cefa62324c..be0ea1e5c4cc 100644
219 --- a/MAINTAINERS
220 +++ b/MAINTAINERS
221 @@ -5196,6 +5196,23 @@ F:       fs/fuse/
222  F:     include/uapi/linux/fuse.h
223  F:     Documentation/filesystems/fuse.txt
225 +FUTEX SUBSYSTEM
226 +M:     Thomas Gleixner <tglx@linutronix.de>
227 +M:     Ingo Molnar <mingo@redhat.com>
228 +R:     Peter Zijlstra <peterz@infradead.org>
229 +R:     Darren Hart <dvhart@infradead.org>
230 +L:     linux-kernel@vger.kernel.org
231 +T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
232 +S:     Maintained
233 +F:     kernel/futex.c
234 +F:     kernel/futex_compat.c
235 +F:     include/asm-generic/futex.h
236 +F:     include/linux/futex.h
237 +F:     include/uapi/linux/futex.h
238 +F:     tools/testing/selftests/futex/
239 +F:     tools/perf/bench/futex*
240 +F:     Documentation/*futex*
242  FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
243  M:     Rik Faith <faith@cs.unc.edu>
244  L:     linux-scsi@vger.kernel.org
245 diff --git a/arch/Kconfig b/arch/Kconfig
246 index 659bdd079277..099fc0f5155e 100644
247 --- a/arch/Kconfig
248 +++ b/arch/Kconfig
249 @@ -9,6 +9,7 @@ config OPROFILE
250         tristate "OProfile system profiling"
251         depends on PROFILING
252         depends on HAVE_OPROFILE
253 +       depends on !PREEMPT_RT_FULL
254         select RING_BUFFER
255         select RING_BUFFER_ALLOW_SWAP
256         help
257 @@ -52,6 +53,7 @@ config KPROBES
258  config JUMP_LABEL
259         bool "Optimize very unlikely/likely branches"
260         depends on HAVE_ARCH_JUMP_LABEL
261 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
262         help
263           This option enables a transparent branch optimization that
264          makes certain almost-always-true or almost-always-false branch
265 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
266 index b5d529fdffab..5715844e83e3 100644
267 --- a/arch/arm/Kconfig
268 +++ b/arch/arm/Kconfig
269 @@ -36,7 +36,7 @@ config ARM
270         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
271         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
272         select HAVE_ARCH_HARDENED_USERCOPY
273 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
274 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
275         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
276         select HAVE_ARCH_MMAP_RND_BITS if MMU
277         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
278 @@ -75,6 +75,7 @@ config ARM
279         select HAVE_PERF_EVENTS
280         select HAVE_PERF_REGS
281         select HAVE_PERF_USER_STACK_DUMP
282 +       select HAVE_PREEMPT_LAZY
283         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
284         select HAVE_REGS_AND_STACK_ACCESS_API
285         select HAVE_SYSCALL_TRACEPOINTS
286 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
287 index e53638c8ed8a..6095a1649865 100644
288 --- a/arch/arm/include/asm/irq.h
289 +++ b/arch/arm/include/asm/irq.h
290 @@ -22,6 +22,8 @@
291  #endif
293  #ifndef __ASSEMBLY__
294 +#include <linux/cpumask.h>
296  struct irqaction;
297  struct pt_regs;
298  extern void migrate_irqs(void);
299 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
300 index 12ebfcc1d539..c962084605bc 100644
301 --- a/arch/arm/include/asm/switch_to.h
302 +++ b/arch/arm/include/asm/switch_to.h
303 @@ -3,6 +3,13 @@
305  #include <linux/thread_info.h>
307 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
308 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
309 +#else
310 +static inline void
311 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
312 +#endif
314  /*
315   * For v7 SMP cores running a preemptible kernel we may be pre-empted
316   * during a TLB maintenance operation, so execute an inner-shareable dsb
317 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
318  #define switch_to(prev,next,last)                                      \
319  do {                                                                   \
320         __complete_pending_tlbi();                                      \
321 +       switch_kmaps(prev, next);                                       \
322         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
323  } while (0)
325 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
326 index 776757d1604a..1f36a4eccc72 100644
327 --- a/arch/arm/include/asm/thread_info.h
328 +++ b/arch/arm/include/asm/thread_info.h
329 @@ -49,6 +49,7 @@ struct cpu_context_save {
330  struct thread_info {
331         unsigned long           flags;          /* low level flags */
332         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
333 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
334         mm_segment_t            addr_limit;     /* address limit */
335         struct task_struct      *task;          /* main task structure */
336         __u32                   cpu;            /* cpu */
337 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
338  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
339  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
340  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
341 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
342 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
343 +#define TIF_NEED_RESCHED_LAZY  7
345  #define TIF_NOHZ               12      /* in adaptive nohz mode */
346  #define TIF_USING_IWMMXT       17
347 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
348  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
349  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
350  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
351 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
352  #define _TIF_UPROBE            (1 << TIF_UPROBE)
353  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
354  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
355 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
356   * Change these and you break ASM code in entry-common.S
357   */
358  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
359 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
360 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
361 +                                _TIF_NEED_RESCHED_LAZY)
363  #endif /* __KERNEL__ */
364  #endif /* __ASM_ARM_THREAD_INFO_H */
365 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
366 index 608008229c7d..3866da3f7bb7 100644
367 --- a/arch/arm/kernel/asm-offsets.c
368 +++ b/arch/arm/kernel/asm-offsets.c
369 @@ -65,6 +65,7 @@ int main(void)
370    BLANK();
371    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
372    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
373 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
374    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
375    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
376    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
377 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
378 index 9f157e7c51e7..468e224d76aa 100644
379 --- a/arch/arm/kernel/entry-armv.S
380 +++ b/arch/arm/kernel/entry-armv.S
381 @@ -220,11 +220,18 @@ __irq_svc:
383  #ifdef CONFIG_PREEMPT
384         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
385 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
386         teq     r8, #0                          @ if preempt count != 0
387 +       bne     1f                              @ return from exception
388 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
389 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
390 +       blne    svc_preempt                     @ preempt!
392 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
393 +       teq     r8, #0                          @ if preempt lazy count != 0
394         movne   r0, #0                          @ force flags to 0
395 -       tst     r0, #_TIF_NEED_RESCHED
396 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
397         blne    svc_preempt
399  #endif
401         svc_exit r5, irq = 1                    @ return from exception
402 @@ -239,8 +246,14 @@ svc_preempt:
403  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
404         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
405         tst     r0, #_TIF_NEED_RESCHED
406 +       bne     1b
407 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
408         reteq   r8                              @ go again
409 -       b       1b
410 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
411 +       teq     r0, #0                          @ if preempt lazy count != 0
412 +       beq     1b
413 +       ret     r8                              @ go again
415  #endif
417  __und_fault:
418 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
419 index 10c3283d6c19..8872937862cc 100644
420 --- a/arch/arm/kernel/entry-common.S
421 +++ b/arch/arm/kernel/entry-common.S
422 @@ -36,7 +36,9 @@ ret_fast_syscall:
423   UNWIND(.cantunwind    )
424         disable_irq_notrace                     @ disable interrupts
425         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
426 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
427 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
428 +       bne     fast_work_pending
429 +       tst     r1, #_TIF_SECCOMP
430         bne     fast_work_pending
432         /* perform architecture specific actions before user return */
433 @@ -62,8 +64,11 @@ ret_fast_syscall:
434         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
435         disable_irq_notrace                     @ disable interrupts
436         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
437 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
438 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
439 +       bne     do_slower_path
440 +       tst     r1, #_TIF_SECCOMP
441         beq     no_work_pending
442 +do_slower_path:
443   UNWIND(.fnend         )
444  ENDPROC(ret_fast_syscall)
446 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
447 index 69bda1a5707e..1f665acaa6a9 100644
448 --- a/arch/arm/kernel/patch.c
449 +++ b/arch/arm/kernel/patch.c
450 @@ -15,7 +15,7 @@ struct patch {
451         unsigned int insn;
452  };
454 -static DEFINE_SPINLOCK(patch_lock);
455 +static DEFINE_RAW_SPINLOCK(patch_lock);
457  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
458         __acquires(&patch_lock)
459 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
460                 return addr;
462         if (flags)
463 -               spin_lock_irqsave(&patch_lock, *flags);
464 +               raw_spin_lock_irqsave(&patch_lock, *flags);
465         else
466                 __acquire(&patch_lock);
468 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
469         clear_fixmap(fixmap);
471         if (flags)
472 -               spin_unlock_irqrestore(&patch_lock, *flags);
473 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
474         else
475                 __release(&patch_lock);
477 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
478 index 91d2d5b01414..750550098b59 100644
479 --- a/arch/arm/kernel/process.c
480 +++ b/arch/arm/kernel/process.c
481 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
484  #ifdef CONFIG_MMU
486 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
487 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
488 + * fail.
489 + */
490 +static int __init vectors_user_mapping_init_page(void)
492 +       struct page *page;
493 +       unsigned long addr = 0xffff0000;
494 +       pgd_t *pgd;
495 +       pud_t *pud;
496 +       pmd_t *pmd;
498 +       pgd = pgd_offset_k(addr);
499 +       pud = pud_offset(pgd, addr);
500 +       pmd = pmd_offset(pud, addr);
501 +       page = pmd_page(*(pmd));
503 +       pgtable_page_ctor(page);
505 +       return 0;
507 +late_initcall(vectors_user_mapping_init_page);
509  #ifdef CONFIG_KUSER_HELPERS
510  /*
511   * The vectors page is always readable from user space for the
512 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
513 index 7b8f2141427b..96541e00b74a 100644
514 --- a/arch/arm/kernel/signal.c
515 +++ b/arch/arm/kernel/signal.c
516 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
517          */
518         trace_hardirqs_off();
519         do {
520 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
521 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
522 +                                          _TIF_NEED_RESCHED_LAZY))) {
523                         schedule();
524                 } else {
525                         if (unlikely(!user_mode(regs)))
526 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
527 index 7dd14e8395e6..4cd7e3d98035 100644
528 --- a/arch/arm/kernel/smp.c
529 +++ b/arch/arm/kernel/smp.c
530 @@ -234,8 +234,6 @@ int __cpu_disable(void)
531         flush_cache_louis();
532         local_flush_tlb_all();
534 -       clear_tasks_mm_cpumask(cpu);
536         return 0;
539 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
540                 pr_err("CPU%u: cpu didn't die\n", cpu);
541                 return;
542         }
544 +       clear_tasks_mm_cpumask(cpu);
546         pr_notice("CPU%u: shutdown\n", cpu);
548         /*
549 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
550 index 0bee233fef9a..314cfb232a63 100644
551 --- a/arch/arm/kernel/unwind.c
552 +++ b/arch/arm/kernel/unwind.c
553 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
554  static const struct unwind_idx *__origin_unwind_idx;
555  extern const struct unwind_idx __stop_unwind_idx[];
557 -static DEFINE_SPINLOCK(unwind_lock);
558 +static DEFINE_RAW_SPINLOCK(unwind_lock);
559  static LIST_HEAD(unwind_tables);
561  /* Convert a prel31 symbol to an absolute address */
562 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
563                 /* module unwind tables */
564                 struct unwind_table *table;
566 -               spin_lock_irqsave(&unwind_lock, flags);
567 +               raw_spin_lock_irqsave(&unwind_lock, flags);
568                 list_for_each_entry(table, &unwind_tables, list) {
569                         if (addr >= table->begin_addr &&
570                             addr < table->end_addr) {
571 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
572                                 break;
573                         }
574                 }
575 -               spin_unlock_irqrestore(&unwind_lock, flags);
576 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
577         }
579         pr_debug("%s: idx = %p\n", __func__, idx);
580 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
581         tab->begin_addr = text_addr;
582         tab->end_addr = text_addr + text_size;
584 -       spin_lock_irqsave(&unwind_lock, flags);
585 +       raw_spin_lock_irqsave(&unwind_lock, flags);
586         list_add_tail(&tab->list, &unwind_tables);
587 -       spin_unlock_irqrestore(&unwind_lock, flags);
588 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
590         return tab;
592 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
593         if (!tab)
594                 return;
596 -       spin_lock_irqsave(&unwind_lock, flags);
597 +       raw_spin_lock_irqsave(&unwind_lock, flags);
598         list_del(&tab->list);
599 -       spin_unlock_irqrestore(&unwind_lock, flags);
600 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
602         kfree(tab);
604 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
605 index 19b5f5c1c0ff..82aa639e6737 100644
606 --- a/arch/arm/kvm/arm.c
607 +++ b/arch/arm/kvm/arm.c
608 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
609                  * involves poking the GIC, which must be done in a
610                  * non-preemptible context.
611                  */
612 -               preempt_disable();
613 +               migrate_disable();
614                 kvm_pmu_flush_hwstate(vcpu);
615                 kvm_timer_flush_hwstate(vcpu);
616                 kvm_vgic_flush_hwstate(vcpu);
617 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
618                         kvm_pmu_sync_hwstate(vcpu);
619                         kvm_timer_sync_hwstate(vcpu);
620                         kvm_vgic_sync_hwstate(vcpu);
621 -                       preempt_enable();
622 +                       migrate_enable();
623                         continue;
624                 }
626 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
628                 kvm_vgic_sync_hwstate(vcpu);
630 -               preempt_enable();
631 +               migrate_enable();
633                 ret = handle_exit(vcpu, run, ret);
634         }
635 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
636 index 98ffe1e62ad5..df9769ddece5 100644
637 --- a/arch/arm/mach-exynos/platsmp.c
638 +++ b/arch/arm/mach-exynos/platsmp.c
639 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
640         return (void __iomem *)(S5P_VA_SCU);
643 -static DEFINE_SPINLOCK(boot_lock);
644 +static DEFINE_RAW_SPINLOCK(boot_lock);
646  static void exynos_secondary_init(unsigned int cpu)
648 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
649         /*
650          * Synchronise with the boot thread.
651          */
652 -       spin_lock(&boot_lock);
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_lock(&boot_lock);
655 +       raw_spin_unlock(&boot_lock);
658  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
659 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
660          * Set synchronisation state between this boot processor
661          * and the secondary one
662          */
663 -       spin_lock(&boot_lock);
664 +       raw_spin_lock(&boot_lock);
666         /*
667          * The secondary processor is waiting to be released from
668 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
670                 if (timeout == 0) {
671                         printk(KERN_ERR "cpu1 power enable failed");
672 -                       spin_unlock(&boot_lock);
673 +                       raw_spin_unlock(&boot_lock);
674                         return -ETIMEDOUT;
675                 }
676         }
677 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
678          * calibrations, then wait for it to finish
679          */
680  fail:
681 -       spin_unlock(&boot_lock);
682 +       raw_spin_unlock(&boot_lock);
684         return pen_release != -1 ? ret : 0;
686 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
687 index 4b653a8cb75c..b03d5a922cb1 100644
688 --- a/arch/arm/mach-hisi/platmcpm.c
689 +++ b/arch/arm/mach-hisi/platmcpm.c
690 @@ -61,7 +61,7 @@
692  static void __iomem *sysctrl, *fabric;
693  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
694 -static DEFINE_SPINLOCK(boot_lock);
695 +static DEFINE_RAW_SPINLOCK(boot_lock);
696  static u32 fabric_phys_addr;
697  /*
698   * [0]: bootwrapper physical address
699 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
700         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
701                 return -EINVAL;
703 -       spin_lock_irq(&boot_lock);
704 +       raw_spin_lock_irq(&boot_lock);
706         if (hip04_cpu_table[cluster][cpu])
707                 goto out;
708 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
710  out:
711         hip04_cpu_table[cluster][cpu]++;
712 -       spin_unlock_irq(&boot_lock);
713 +       raw_spin_unlock_irq(&boot_lock);
715         return 0;
717 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
718         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
719         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
721 -       spin_lock(&boot_lock);
722 +       raw_spin_lock(&boot_lock);
723         hip04_cpu_table[cluster][cpu]--;
724         if (hip04_cpu_table[cluster][cpu] == 1) {
725                 /* A power_up request went ahead of us. */
726 -               spin_unlock(&boot_lock);
727 +               raw_spin_unlock(&boot_lock);
728                 return;
729         } else if (hip04_cpu_table[cluster][cpu] > 1) {
730                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
731 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
732         }
734         last_man = hip04_cluster_is_down(cluster);
735 -       spin_unlock(&boot_lock);
736 +       raw_spin_unlock(&boot_lock);
737         if (last_man) {
738                 /* Since it's Cortex A15, disable L2 prefetching. */
739                 asm volatile(
740 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
741                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
743         count = TIMEOUT_MSEC / POLL_MSEC;
744 -       spin_lock_irq(&boot_lock);
745 +       raw_spin_lock_irq(&boot_lock);
746         for (tries = 0; tries < count; tries++) {
747                 if (hip04_cpu_table[cluster][cpu])
748                         goto err;
749 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
750                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
751                 if (data & CORE_WFI_STATUS(cpu))
752                         break;
753 -               spin_unlock_irq(&boot_lock);
754 +               raw_spin_unlock_irq(&boot_lock);
755                 /* Wait for clean L2 when the whole cluster is down. */
756                 msleep(POLL_MSEC);
757 -               spin_lock_irq(&boot_lock);
758 +               raw_spin_lock_irq(&boot_lock);
759         }
760         if (tries >= count)
761                 goto err;
762 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
763                 goto err;
764         if (hip04_cluster_is_down(cluster))
765                 hip04_set_snoop_filter(cluster, 0);
766 -       spin_unlock_irq(&boot_lock);
767 +       raw_spin_unlock_irq(&boot_lock);
768         return 1;
769  err:
770 -       spin_unlock_irq(&boot_lock);
771 +       raw_spin_unlock_irq(&boot_lock);
772         return 0;
774  #endif
775 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
776 index b4de3da6dffa..b52893319d75 100644
777 --- a/arch/arm/mach-omap2/omap-smp.c
778 +++ b/arch/arm/mach-omap2/omap-smp.c
779 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
780         .startup_addr = omap5_secondary_startup,
781  };
783 -static DEFINE_SPINLOCK(boot_lock);
784 +static DEFINE_RAW_SPINLOCK(boot_lock);
786  void __iomem *omap4_get_scu_base(void)
788 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
789         /*
790          * Synchronise with the boot thread.
791          */
792 -       spin_lock(&boot_lock);
793 -       spin_unlock(&boot_lock);
794 +       raw_spin_lock(&boot_lock);
795 +       raw_spin_unlock(&boot_lock);
798  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
799 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
800          * Set synchronisation state between this boot processor
801          * and the secondary one
802          */
803 -       spin_lock(&boot_lock);
804 +       raw_spin_lock(&boot_lock);
806         /*
807          * Update the AuxCoreBoot0 with boot state for secondary core.
808 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
809          * Now the secondary core is starting up let it run its
810          * calibrations, then wait for it to finish
811          */
812 -       spin_unlock(&boot_lock);
813 +       raw_spin_unlock(&boot_lock);
815         return 0;
817 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
818 index 0875b99add18..18b6d98d2581 100644
819 --- a/arch/arm/mach-prima2/platsmp.c
820 +++ b/arch/arm/mach-prima2/platsmp.c
821 @@ -22,7 +22,7 @@
823  static void __iomem *clk_base;
825 -static DEFINE_SPINLOCK(boot_lock);
826 +static DEFINE_RAW_SPINLOCK(boot_lock);
828  static void sirfsoc_secondary_init(unsigned int cpu)
830 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
831         /*
832          * Synchronise with the boot thread.
833          */
834 -       spin_lock(&boot_lock);
835 -       spin_unlock(&boot_lock);
836 +       raw_spin_lock(&boot_lock);
837 +       raw_spin_unlock(&boot_lock);
840  static const struct of_device_id clk_ids[]  = {
841 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
842         /* make sure write buffer is drained */
843         mb();
845 -       spin_lock(&boot_lock);
846 +       raw_spin_lock(&boot_lock);
848         /*
849          * The secondary processor is waiting to be released from
850 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
851          * now the secondary core is starting up let it run its
852          * calibrations, then wait for it to finish
853          */
854 -       spin_unlock(&boot_lock);
855 +       raw_spin_unlock(&boot_lock);
857         return pen_release != -1 ? -ENOSYS : 0;
859 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
860 index 5494c9e0c909..e8ce157d3548 100644
861 --- a/arch/arm/mach-qcom/platsmp.c
862 +++ b/arch/arm/mach-qcom/platsmp.c
863 @@ -46,7 +46,7 @@
865  extern void secondary_startup_arm(void);
867 -static DEFINE_SPINLOCK(boot_lock);
868 +static DEFINE_RAW_SPINLOCK(boot_lock);
870  #ifdef CONFIG_HOTPLUG_CPU
871  static void qcom_cpu_die(unsigned int cpu)
872 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
873         /*
874          * Synchronise with the boot thread.
875          */
876 -       spin_lock(&boot_lock);
877 -       spin_unlock(&boot_lock);
878 +       raw_spin_lock(&boot_lock);
879 +       raw_spin_unlock(&boot_lock);
882  static int scss_release_secondary(unsigned int cpu)
883 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
884          * set synchronisation state between this boot processor
885          * and the secondary one
886          */
887 -       spin_lock(&boot_lock);
888 +       raw_spin_lock(&boot_lock);
890         /*
891          * Send the secondary CPU a soft interrupt, thereby causing
892 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
893          * now the secondary core is starting up let it run its
894          * calibrations, then wait for it to finish
895          */
896 -       spin_unlock(&boot_lock);
897 +       raw_spin_unlock(&boot_lock);
899         return ret;
901 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
902 index 8d1e2d551786..7fa56cc78118 100644
903 --- a/arch/arm/mach-spear/platsmp.c
904 +++ b/arch/arm/mach-spear/platsmp.c
905 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
906         sync_cache_w(&pen_release);
909 -static DEFINE_SPINLOCK(boot_lock);
910 +static DEFINE_RAW_SPINLOCK(boot_lock);
912  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
914 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
915         /*
916          * Synchronise with the boot thread.
917          */
918 -       spin_lock(&boot_lock);
919 -       spin_unlock(&boot_lock);
920 +       raw_spin_lock(&boot_lock);
921 +       raw_spin_unlock(&boot_lock);
924  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
925 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
926          * set synchronisation state between this boot processor
927          * and the secondary one
928          */
929 -       spin_lock(&boot_lock);
930 +       raw_spin_lock(&boot_lock);
932         /*
933          * The secondary processor is waiting to be released from
934 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
935          * now the secondary core is starting up let it run its
936          * calibrations, then wait for it to finish
937          */
938 -       spin_unlock(&boot_lock);
939 +       raw_spin_unlock(&boot_lock);
941         return pen_release != -1 ? -ENOSYS : 0;
943 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
944 index ea5a2277ee46..b988e081ac79 100644
945 --- a/arch/arm/mach-sti/platsmp.c
946 +++ b/arch/arm/mach-sti/platsmp.c
947 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
948         sync_cache_w(&pen_release);
951 -static DEFINE_SPINLOCK(boot_lock);
952 +static DEFINE_RAW_SPINLOCK(boot_lock);
954  static void sti_secondary_init(unsigned int cpu)
956 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
957         /*
958          * Synchronise with the boot thread.
959          */
960 -       spin_lock(&boot_lock);
961 -       spin_unlock(&boot_lock);
962 +       raw_spin_lock(&boot_lock);
963 +       raw_spin_unlock(&boot_lock);
966  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
967 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
968          * set synchronisation state between this boot processor
969          * and the secondary one
970          */
971 -       spin_lock(&boot_lock);
972 +       raw_spin_lock(&boot_lock);
974         /*
975          * The secondary processor is waiting to be released from
976 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
977          * now the secondary core is starting up let it run its
978          * calibrations, then wait for it to finish
979          */
980 -       spin_unlock(&boot_lock);
981 +       raw_spin_unlock(&boot_lock);
983         return pen_release != -1 ? -ENOSYS : 0;
985 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
986 index f7861dc83182..ce47dfe25fb0 100644
987 --- a/arch/arm/mm/fault.c
988 +++ b/arch/arm/mm/fault.c
989 @@ -433,6 +433,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
990         if (addr < TASK_SIZE)
991                 return do_page_fault(addr, fsr, regs);
993 +       if (interrupts_enabled(regs))
994 +               local_irq_enable();
996         if (user_mode(regs))
997                 goto bad_area;
999 @@ -500,6 +503,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1000  static int
1001  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1003 +       if (interrupts_enabled(regs))
1004 +               local_irq_enable();
1006         do_bad_area(addr, fsr, regs);
1007         return 0;
1009 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1010 index d02f8187b1cc..542692dbd40a 100644
1011 --- a/arch/arm/mm/highmem.c
1012 +++ b/arch/arm/mm/highmem.c
1013 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1014         return *ptep;
1017 +static unsigned int fixmap_idx(int type)
1019 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1022  void *kmap(struct page *page)
1024         might_sleep();
1025 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1027  void *kmap_atomic(struct page *page)
1029 +       pte_t pte = mk_pte(page, kmap_prot);
1030         unsigned int idx;
1031         unsigned long vaddr;
1032         void *kmap;
1033         int type;
1035 -       preempt_disable();
1036 +       preempt_disable_nort();
1037         pagefault_disable();
1038         if (!PageHighMem(page))
1039                 return page_address(page);
1040 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1042         type = kmap_atomic_idx_push();
1044 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1045 +       idx = fixmap_idx(type);
1046         vaddr = __fix_to_virt(idx);
1047  #ifdef CONFIG_DEBUG_HIGHMEM
1048         /*
1049 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1050          * in place, so the contained TLB flush ensures the TLB is updated
1051          * with the new mapping.
1052          */
1053 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1054 +#ifdef CONFIG_PREEMPT_RT_FULL
1055 +       current->kmap_pte[type] = pte;
1056 +#endif
1057 +       set_fixmap_pte(idx, pte);
1059         return (void *)vaddr;
1061 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1063         if (kvaddr >= (void *)FIXADDR_START) {
1064                 type = kmap_atomic_idx();
1065 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1066 +               idx = fixmap_idx(type);
1068                 if (cache_is_vivt())
1069                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1070 +#ifdef CONFIG_PREEMPT_RT_FULL
1071 +               current->kmap_pte[type] = __pte(0);
1072 +#endif
1073  #ifdef CONFIG_DEBUG_HIGHMEM
1074                 BUG_ON(vaddr != __fix_to_virt(idx));
1075 -               set_fixmap_pte(idx, __pte(0));
1076  #else
1077                 (void) idx;  /* to kill a warning */
1078  #endif
1079 +               set_fixmap_pte(idx, __pte(0));
1080                 kmap_atomic_idx_pop();
1081         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1082                 /* this address was obtained through kmap_high_get() */
1083                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1084         }
1085         pagefault_enable();
1086 -       preempt_enable();
1087 +       preempt_enable_nort();
1089  EXPORT_SYMBOL(__kunmap_atomic);
1091  void *kmap_atomic_pfn(unsigned long pfn)
1093 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1094         unsigned long vaddr;
1095         int idx, type;
1096         struct page *page = pfn_to_page(pfn);
1098 -       preempt_disable();
1099 +       preempt_disable_nort();
1100         pagefault_disable();
1101         if (!PageHighMem(page))
1102                 return page_address(page);
1104         type = kmap_atomic_idx_push();
1105 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1106 +       idx = fixmap_idx(type);
1107         vaddr = __fix_to_virt(idx);
1108  #ifdef CONFIG_DEBUG_HIGHMEM
1109         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1110  #endif
1111 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1112 +#ifdef CONFIG_PREEMPT_RT_FULL
1113 +       current->kmap_pte[type] = pte;
1114 +#endif
1115 +       set_fixmap_pte(idx, pte);
1117         return (void *)vaddr;
1119 +#if defined CONFIG_PREEMPT_RT_FULL
1120 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1122 +       int i;
1124 +       /*
1125 +        * Clear @prev's kmap_atomic mappings
1126 +        */
1127 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1128 +               int idx = fixmap_idx(i);
1130 +               set_fixmap_pte(idx, __pte(0));
1131 +       }
1132 +       /*
1133 +        * Restore @next_p's kmap_atomic mappings
1134 +        */
1135 +       for (i = 0; i < next_p->kmap_idx; i++) {
1136 +               int idx = fixmap_idx(i);
1138 +               if (!pte_none(next_p->kmap_pte[i]))
1139 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1140 +       }
1142 +#endif
1143 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1144 index c2366510187a..6b60f582b738 100644
1145 --- a/arch/arm/plat-versatile/platsmp.c
1146 +++ b/arch/arm/plat-versatile/platsmp.c
1147 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1148         sync_cache_w(&pen_release);
1151 -static DEFINE_SPINLOCK(boot_lock);
1152 +static DEFINE_RAW_SPINLOCK(boot_lock);
1154  void versatile_secondary_init(unsigned int cpu)
1156 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1157         /*
1158          * Synchronise with the boot thread.
1159          */
1160 -       spin_lock(&boot_lock);
1161 -       spin_unlock(&boot_lock);
1162 +       raw_spin_lock(&boot_lock);
1163 +       raw_spin_unlock(&boot_lock);
1166  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1167 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1168          * Set synchronisation state between this boot processor
1169          * and the secondary one
1170          */
1171 -       spin_lock(&boot_lock);
1172 +       raw_spin_lock(&boot_lock);
1174         /*
1175          * This is really belt and braces; we hold unintended secondary
1176 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1177          * now the secondary core is starting up let it run its
1178          * calibrations, then wait for it to finish
1179          */
1180 -       spin_unlock(&boot_lock);
1181 +       raw_spin_unlock(&boot_lock);
1183         return pen_release != -1 ? -ENOSYS : 0;
1185 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1186 index cf57a7799a0f..78d1b49fbed5 100644
1187 --- a/arch/arm64/Kconfig
1188 +++ b/arch/arm64/Kconfig
1189 @@ -91,6 +91,7 @@ config ARM64
1190         select HAVE_PERF_EVENTS
1191         select HAVE_PERF_REGS
1192         select HAVE_PERF_USER_STACK_DUMP
1193 +       select HAVE_PREEMPT_LAZY
1194         select HAVE_REGS_AND_STACK_ACCESS_API
1195         select HAVE_RCU_TABLE_FREE
1196         select HAVE_SYSCALL_TRACEPOINTS
1197 @@ -704,7 +705,7 @@ config XEN_DOM0
1199  config XEN
1200         bool "Xen guest support on ARM64"
1201 -       depends on ARM64 && OF
1202 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1203         select SWIOTLB_XEN
1204         select PARAVIRT
1205         help
1206 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1207 index e9ea5a6bd449..6c500ad63c6a 100644
1208 --- a/arch/arm64/include/asm/thread_info.h
1209 +++ b/arch/arm64/include/asm/thread_info.h
1210 @@ -49,6 +49,7 @@ struct thread_info {
1211         mm_segment_t            addr_limit;     /* address limit */
1212         struct task_struct      *task;          /* main task structure */
1213         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1214 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1215         int                     cpu;            /* cpu */
1216  };
1218 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1219  #define TIF_NEED_RESCHED       1
1220  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1221  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1222 +#define TIF_NEED_RESCHED_LAZY  4
1223  #define TIF_NOHZ               7
1224  #define TIF_SYSCALL_TRACE      8
1225  #define TIF_SYSCALL_AUDIT      9
1226 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1227  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1228  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1229  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1230 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1231  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1232  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1233  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1234 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1235  #define _TIF_32BIT             (1 << TIF_32BIT)
1237  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1238 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1239 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1240 +                                _TIF_NEED_RESCHED_LAZY)
1241 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1243  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1244                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1245 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1246 index c58ddf8c4062..a8f2f7c1fe12 100644
1247 --- a/arch/arm64/kernel/asm-offsets.c
1248 +++ b/arch/arm64/kernel/asm-offsets.c
1249 @@ -38,6 +38,7 @@ int main(void)
1250    BLANK();
1251    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1252    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1253 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1254    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1255    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1256    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1257 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1258 index b4c7db434654..433d846f4f51 100644
1259 --- a/arch/arm64/kernel/entry.S
1260 +++ b/arch/arm64/kernel/entry.S
1261 @@ -430,11 +430,16 @@ el1_irq:
1263  #ifdef CONFIG_PREEMPT
1264         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1265 -       cbnz    w24, 1f                         // preempt count != 0
1266 +       cbnz    w24, 2f                         // preempt count != 0
1267         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1268 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1269 -       bl      el1_preempt
1270 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1272 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1273 +       cbnz    w24, 2f                         // preempt lazy count != 0
1274 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1275  1:
1276 +       bl      el1_preempt
1278  #endif
1279  #ifdef CONFIG_TRACE_IRQFLAGS
1280         bl      trace_hardirqs_on
1281 @@ -448,6 +453,7 @@ el1_preempt:
1282  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1283         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1284         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1285 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1286         ret     x24
1287  #endif
1289 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1290 index 404dd67080b9..639dc6d12e72 100644
1291 --- a/arch/arm64/kernel/signal.c
1292 +++ b/arch/arm64/kernel/signal.c
1293 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1294          */
1295         trace_hardirqs_off();
1296         do {
1297 -               if (thread_flags & _TIF_NEED_RESCHED) {
1298 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1299                         schedule();
1300                 } else {
1301                         local_irq_enable();
1302 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1303 index 5e844f68e847..dc613cc10f54 100644
1304 --- a/arch/mips/Kconfig
1305 +++ b/arch/mips/Kconfig
1306 @@ -2516,7 +2516,7 @@ config MIPS_ASID_BITS_VARIABLE
1308  config HIGHMEM
1309         bool "High Memory Support"
1310 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1311 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1313  config CPU_SUPPORTS_HIGHMEM
1314         bool
1315 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1316 index 6eda5abbd719..601e27701a4a 100644
1317 --- a/arch/powerpc/Kconfig
1318 +++ b/arch/powerpc/Kconfig
1319 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1321  config RWSEM_GENERIC_SPINLOCK
1322         bool
1323 +       default y if PREEMPT_RT_FULL
1325  config RWSEM_XCHGADD_ALGORITHM
1326         bool
1327 -       default y
1328 +       default y if !PREEMPT_RT_FULL
1330  config GENERIC_LOCKBREAK
1331         bool
1332 @@ -134,6 +135,7 @@ config PPC
1333         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1334         select GENERIC_STRNCPY_FROM_USER
1335         select GENERIC_STRNLEN_USER
1336 +       select HAVE_PREEMPT_LAZY
1337         select HAVE_MOD_ARCH_SPECIFIC
1338         select MODULES_USE_ELF_RELA
1339         select CLONE_BACKWARDS
1340 @@ -321,7 +323,7 @@ menu "Kernel options"
1342  config HIGHMEM
1343         bool "High memory support"
1344 -       depends on PPC32
1345 +       depends on PPC32 && !PREEMPT_RT_FULL
1347  source kernel/Kconfig.hz
1348  source kernel/Kconfig.preempt
1349 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1350 index 87e4b2d8dcd4..981e501a4359 100644
1351 --- a/arch/powerpc/include/asm/thread_info.h
1352 +++ b/arch/powerpc/include/asm/thread_info.h
1353 @@ -43,6 +43,8 @@ struct thread_info {
1354         int             cpu;                    /* cpu we're on */
1355         int             preempt_count;          /* 0 => preemptable,
1356                                                    <0 => BUG */
1357 +       int             preempt_lazy_count;     /* 0 => preemptable,
1358 +                                                  <0 => BUG */
1359         unsigned long   local_flags;            /* private flags for thread */
1360  #ifdef CONFIG_LIVEPATCH
1361         unsigned long *livepatch_sp;
1362 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1363  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1364  #define TIF_SIGPENDING         1       /* signal pending */
1365  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1366 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1367 -                                          TIF_NEED_RESCHED */
1368 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1369  #define TIF_32BIT              4       /* 32 bit binary */
1370  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1371  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1372 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1373  #if defined(CONFIG_PPC64)
1374  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1375  #endif
1376 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1377 +                                          TIF_NEED_RESCHED */
1379  /* as above, but as bit values */
1380  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1381 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1382  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1383  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1384  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1385 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1386  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1387                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1388                                  _TIF_NOHZ)
1390  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1391                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1392 -                                _TIF_RESTORE_TM)
1393 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1394  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1395 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1397  /* Bits in local_flags */
1398  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1399 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1400 index c833d88c423d..96e9fbc3f684 100644
1401 --- a/arch/powerpc/kernel/asm-offsets.c
1402 +++ b/arch/powerpc/kernel/asm-offsets.c
1403 @@ -156,6 +156,7 @@ int main(void)
1404         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1405         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1406         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1407 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1408         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1409         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1411 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1412 index 3841d749a430..6dbaeff192b9 100644
1413 --- a/arch/powerpc/kernel/entry_32.S
1414 +++ b/arch/powerpc/kernel/entry_32.S
1415 @@ -835,7 +835,14 @@ resume_kernel:
1416         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1417         bne     restore
1418         andi.   r8,r8,_TIF_NEED_RESCHED
1419 +       bne+    1f
1420 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1421 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1422 +       bne     restore
1423 +       lwz     r0,TI_FLAGS(r9)
1424 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1425         beq+    restore
1427         lwz     r3,_MSR(r1)
1428         andi.   r0,r3,MSR_EE    /* interrupts off? */
1429         beq     restore         /* don't schedule if so */
1430 @@ -846,11 +853,11 @@ resume_kernel:
1431          */
1432         bl      trace_hardirqs_off
1433  #endif
1434 -1:     bl      preempt_schedule_irq
1435 +2:     bl      preempt_schedule_irq
1436         CURRENT_THREAD_INFO(r9, r1)
1437         lwz     r3,TI_FLAGS(r9)
1438 -       andi.   r0,r3,_TIF_NEED_RESCHED
1439 -       bne-    1b
1440 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1441 +       bne-    2b
1442  #ifdef CONFIG_TRACE_IRQFLAGS
1443         /* And now, to properly rebalance the above, we tell lockdep they
1444          * are being turned back on, which will happen when we return
1445 @@ -1171,7 +1178,7 @@ global_dbcr0:
1446  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1448  do_work:                       /* r10 contains MSR_KERNEL here */
1449 -       andi.   r0,r9,_TIF_NEED_RESCHED
1450 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1451         beq     do_user_signal
1453  do_resched:                    /* r10 contains MSR_KERNEL here */
1454 @@ -1192,7 +1199,7 @@ recheck:
1455         MTMSRD(r10)             /* disable interrupts */
1456         CURRENT_THREAD_INFO(r9, r1)
1457         lwz     r9,TI_FLAGS(r9)
1458 -       andi.   r0,r9,_TIF_NEED_RESCHED
1459 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1460         bne-    do_resched
1461         andi.   r0,r9,_TIF_USER_WORK_MASK
1462         beq     restore_user
1463 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1464 index caa659671599..891080c4a41e 100644
1465 --- a/arch/powerpc/kernel/entry_64.S
1466 +++ b/arch/powerpc/kernel/entry_64.S
1467 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1468         bl      restore_math
1469         b       restore
1470  #endif
1471 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1472 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1473         beq     2f
1474         bl      restore_interrupts
1475         SCHEDULE_USER
1476 @@ -718,10 +718,18 @@ resume_kernel:
1478  #ifdef CONFIG_PREEMPT
1479         /* Check if we need to preempt */
1480 +       lwz     r8,TI_PREEMPT(r9)
1481 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1482 +       bne     restore
1483         andi.   r0,r4,_TIF_NEED_RESCHED
1484 +       bne+    check_count
1486 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1487         beq+    restore
1488 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1490         /* Check that preempt_count() == 0 and interrupts are enabled */
1491 -       lwz     r8,TI_PREEMPT(r9)
1492 +check_count:
1493         cmpwi   cr1,r8,0
1494         ld      r0,SOFTE(r1)
1495         cmpdi   r0,0
1496 @@ -738,7 +746,7 @@ resume_kernel:
1497         /* Re-test flags and eventually loop */
1498         CURRENT_THREAD_INFO(r9, r1)
1499         ld      r4,TI_FLAGS(r9)
1500 -       andi.   r0,r4,_TIF_NEED_RESCHED
1501 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1502         bne     1b
1504         /*
1505 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1506 index 028a22bfa90c..a75e2dd3e71f 100644
1507 --- a/arch/powerpc/kernel/irq.c
1508 +++ b/arch/powerpc/kernel/irq.c
1509 @@ -651,6 +651,7 @@ void irq_ctx_init(void)
1510         }
1513 +#ifndef CONFIG_PREEMPT_RT_FULL
1514  void do_softirq_own_stack(void)
1516         struct thread_info *curtp, *irqtp;
1517 @@ -668,6 +669,7 @@ void do_softirq_own_stack(void)
1518         if (irqtp->flags)
1519                 set_bits(irqtp->flags, &curtp->flags);
1521 +#endif
1523  irq_hw_number_t virq_to_hw(unsigned int virq)
1525 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1526 index 030d72df5dd5..b471a709e100 100644
1527 --- a/arch/powerpc/kernel/misc_32.S
1528 +++ b/arch/powerpc/kernel/misc_32.S
1529 @@ -41,6 +41,7 @@
1530   * We store the saved ksp_limit in the unused part
1531   * of the STACK_FRAME_OVERHEAD
1532   */
1533 +#ifndef CONFIG_PREEMPT_RT_FULL
1534  _GLOBAL(call_do_softirq)
1535         mflr    r0
1536         stw     r0,4(r1)
1537 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1538         stw     r10,THREAD+KSP_LIMIT(r2)
1539         mtlr    r0
1540         blr
1541 +#endif
1543  /*
1544   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1545 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1546 index 4cefe6888b18..cb2ee4be999a 100644
1547 --- a/arch/powerpc/kernel/misc_64.S
1548 +++ b/arch/powerpc/kernel/misc_64.S
1549 @@ -31,6 +31,7 @@
1551         .text
1553 +#ifndef CONFIG_PREEMPT_RT_FULL
1554  _GLOBAL(call_do_softirq)
1555         mflr    r0
1556         std     r0,16(r1)
1557 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1558         ld      r0,16(r1)
1559         mtlr    r0
1560         blr
1561 +#endif
1563  _GLOBAL(call_do_irq)
1564         mflr    r0
1565 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1566 index 029be26b5a17..9528089ea142 100644
1567 --- a/arch/powerpc/kvm/Kconfig
1568 +++ b/arch/powerpc/kvm/Kconfig
1569 @@ -175,6 +175,7 @@ config KVM_E500MC
1570  config KVM_MPIC
1571         bool "KVM in-kernel MPIC emulation"
1572         depends on KVM && E500
1573 +       depends on !PREEMPT_RT_FULL
1574         select HAVE_KVM_IRQCHIP
1575         select HAVE_KVM_IRQFD
1576         select HAVE_KVM_IRQ_ROUTING
1577 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1578 index e48462447ff0..2670cee66064 100644
1579 --- a/arch/powerpc/platforms/ps3/device-init.c
1580 +++ b/arch/powerpc/platforms/ps3/device-init.c
1581 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1582         }
1583         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1585 -       res = wait_event_interruptible(dev->done.wait,
1586 +       res = swait_event_interruptible(dev->done.wait,
1587                                        dev->done.done || kthread_should_stop());
1588         if (kthread_should_stop())
1589                 res = -EINTR;
1590 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1591 index 6c0378c0b8b5..abd58b4dff97 100644
1592 --- a/arch/sh/kernel/irq.c
1593 +++ b/arch/sh/kernel/irq.c
1594 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1595         hardirq_ctx[cpu] = NULL;
1598 +#ifndef CONFIG_PREEMPT_RT_FULL
1599  void do_softirq_own_stack(void)
1601         struct thread_info *curctx;
1602 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1603                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1604         );
1606 +#endif
1607  #else
1608  static inline void handle_one_irq(unsigned int irq)
1610 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1611 index 8b4152f3a764..c5cca159692a 100644
1612 --- a/arch/sparc/Kconfig
1613 +++ b/arch/sparc/Kconfig
1614 @@ -194,12 +194,10 @@ config NR_CPUS
1615  source kernel/Kconfig.hz
1617  config RWSEM_GENERIC_SPINLOCK
1618 -       bool
1619 -       default y if SPARC32
1620 +       def_bool PREEMPT_RT_FULL
1622  config RWSEM_XCHGADD_ALGORITHM
1623 -       bool
1624 -       default y if SPARC64
1625 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1627  config GENERIC_HWEIGHT
1628         bool
1629 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1630 index 5cbf03c14981..6067d9379e5b 100644
1631 --- a/arch/sparc/kernel/irq_64.c
1632 +++ b/arch/sparc/kernel/irq_64.c
1633 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1634         set_irq_regs(old_regs);
1637 +#ifndef CONFIG_PREEMPT_RT_FULL
1638  void do_softirq_own_stack(void)
1640         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1641 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1642         __asm__ __volatile__("mov %0, %%sp"
1643                              : : "r" (orig_sp));
1645 +#endif
1647  #ifdef CONFIG_HOTPLUG_CPU
1648  void fixup_irqs(void)
1649 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1650 index da8156fd3d58..d8cd3bc807fc 100644
1651 --- a/arch/x86/Kconfig
1652 +++ b/arch/x86/Kconfig
1653 @@ -17,6 +17,7 @@ config X86_64
1654  ### Arch settings
1655  config X86
1656         def_bool y
1657 +       select HAVE_PREEMPT_LAZY
1658         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1659         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1660         select ANON_INODES
1661 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1662         def_bool y
1663         depends on ISA_DMA_API
1665 +config RWSEM_GENERIC_SPINLOCK
1666 +       def_bool PREEMPT_RT_FULL
1668  config RWSEM_XCHGADD_ALGORITHM
1669 -       def_bool y
1670 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1672  config GENERIC_CALIBRATE_DELAY
1673         def_bool y
1674 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1675  config MAXSMP
1676         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1677         depends on X86_64 && SMP && DEBUG_KERNEL
1678 -       select CPUMASK_OFFSTACK
1679 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1680         ---help---
1681           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1682           If unsure, say N.
1683 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1684 index aa8b0672f87a..2429414bfc71 100644
1685 --- a/arch/x86/crypto/aesni-intel_glue.c
1686 +++ b/arch/x86/crypto/aesni-intel_glue.c
1687 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1688         err = blkcipher_walk_virt(desc, &walk);
1689         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1691 -       kernel_fpu_begin();
1692         while ((nbytes = walk.nbytes)) {
1693 +               kernel_fpu_begin();
1694                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1695 -                             nbytes & AES_BLOCK_MASK);
1696 +                               nbytes & AES_BLOCK_MASK);
1697 +               kernel_fpu_end();
1698                 nbytes &= AES_BLOCK_SIZE - 1;
1699                 err = blkcipher_walk_done(desc, &walk, nbytes);
1700         }
1701 -       kernel_fpu_end();
1703         return err;
1705 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1706         err = blkcipher_walk_virt(desc, &walk);
1707         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1709 -       kernel_fpu_begin();
1710         while ((nbytes = walk.nbytes)) {
1711 +               kernel_fpu_begin();
1712                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1713                               nbytes & AES_BLOCK_MASK);
1714 +               kernel_fpu_end();
1715                 nbytes &= AES_BLOCK_SIZE - 1;
1716                 err = blkcipher_walk_done(desc, &walk, nbytes);
1717         }
1718 -       kernel_fpu_end();
1720         return err;
1722 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1723         err = blkcipher_walk_virt(desc, &walk);
1724         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1726 -       kernel_fpu_begin();
1727         while ((nbytes = walk.nbytes)) {
1728 +               kernel_fpu_begin();
1729                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1730                               nbytes & AES_BLOCK_MASK, walk.iv);
1731 +               kernel_fpu_end();
1732                 nbytes &= AES_BLOCK_SIZE - 1;
1733                 err = blkcipher_walk_done(desc, &walk, nbytes);
1734         }
1735 -       kernel_fpu_end();
1737         return err;
1739 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1740         err = blkcipher_walk_virt(desc, &walk);
1741         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1743 -       kernel_fpu_begin();
1744         while ((nbytes = walk.nbytes)) {
1745 +               kernel_fpu_begin();
1746                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1747                               nbytes & AES_BLOCK_MASK, walk.iv);
1748 +               kernel_fpu_end();
1749                 nbytes &= AES_BLOCK_SIZE - 1;
1750                 err = blkcipher_walk_done(desc, &walk, nbytes);
1751         }
1752 -       kernel_fpu_end();
1754         return err;
1756 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1757         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1758         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1760 -       kernel_fpu_begin();
1761         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1762 +               kernel_fpu_begin();
1763                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1764                                       nbytes & AES_BLOCK_MASK, walk.iv);
1765 +               kernel_fpu_end();
1766                 nbytes &= AES_BLOCK_SIZE - 1;
1767                 err = blkcipher_walk_done(desc, &walk, nbytes);
1768         }
1769         if (walk.nbytes) {
1770 +               kernel_fpu_begin();
1771                 ctr_crypt_final(ctx, &walk);
1772 +               kernel_fpu_end();
1773                 err = blkcipher_walk_done(desc, &walk, 0);
1774         }
1775 -       kernel_fpu_end();
1777         return err;
1779 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1780 index 8648158f3916..d7699130ee36 100644
1781 --- a/arch/x86/crypto/cast5_avx_glue.c
1782 +++ b/arch/x86/crypto/cast5_avx_glue.c
1783 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1784  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1785                      bool enc)
1787 -       bool fpu_enabled = false;
1788 +       bool fpu_enabled;
1789         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1790         const unsigned int bsize = CAST5_BLOCK_SIZE;
1791         unsigned int nbytes;
1792 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1793                 u8 *wsrc = walk->src.virt.addr;
1794                 u8 *wdst = walk->dst.virt.addr;
1796 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1797 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1799                 /* Process multi-block batch */
1800                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1801 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1802                 } while (nbytes >= bsize);
1804  done:
1805 +               cast5_fpu_end(fpu_enabled);
1806                 err = blkcipher_walk_done(desc, walk, nbytes);
1807         }
1809 -       cast5_fpu_end(fpu_enabled);
1810         return err;
1813 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1814  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1815                        struct scatterlist *src, unsigned int nbytes)
1817 -       bool fpu_enabled = false;
1818 +       bool fpu_enabled;
1819         struct blkcipher_walk walk;
1820         int err;
1822 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1823         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1825         while ((nbytes = walk.nbytes)) {
1826 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1827 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1828                 nbytes = __cbc_decrypt(desc, &walk);
1829 +               cast5_fpu_end(fpu_enabled);
1830                 err = blkcipher_walk_done(desc, &walk, nbytes);
1831         }
1833 -       cast5_fpu_end(fpu_enabled);
1834         return err;
1837 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1838  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1839                      struct scatterlist *src, unsigned int nbytes)
1841 -       bool fpu_enabled = false;
1842 +       bool fpu_enabled;
1843         struct blkcipher_walk walk;
1844         int err;
1846 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1847         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1849         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1850 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1851 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1852                 nbytes = __ctr_crypt(desc, &walk);
1853 +               cast5_fpu_end(fpu_enabled);
1854                 err = blkcipher_walk_done(desc, &walk, nbytes);
1855         }
1857 -       cast5_fpu_end(fpu_enabled);
1859         if (walk.nbytes) {
1860                 ctr_crypt_final(desc, &walk);
1861                 err = blkcipher_walk_done(desc, &walk, 0);
1862 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1863 index 6a85598931b5..3a506ce7ed93 100644
1864 --- a/arch/x86/crypto/glue_helper.c
1865 +++ b/arch/x86/crypto/glue_helper.c
1866 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1867         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1868         const unsigned int bsize = 128 / 8;
1869         unsigned int nbytes, i, func_bytes;
1870 -       bool fpu_enabled = false;
1871 +       bool fpu_enabled;
1872         int err;
1874         err = blkcipher_walk_virt(desc, walk);
1875 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1876                 u8 *wdst = walk->dst.virt.addr;
1878                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1879 -                                            desc, fpu_enabled, nbytes);
1880 +                                            desc, false, nbytes);
1882                 for (i = 0; i < gctx->num_funcs; i++) {
1883                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1884 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1885                 }
1887  done:
1888 +               glue_fpu_end(fpu_enabled);
1889                 err = blkcipher_walk_done(desc, walk, nbytes);
1890         }
1892 -       glue_fpu_end(fpu_enabled);
1893         return err;
1896 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1897                             struct scatterlist *src, unsigned int nbytes)
1899         const unsigned int bsize = 128 / 8;
1900 -       bool fpu_enabled = false;
1901 +       bool fpu_enabled;
1902         struct blkcipher_walk walk;
1903         int err;
1905 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1907         while ((nbytes = walk.nbytes)) {
1908                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1909 -                                            desc, fpu_enabled, nbytes);
1910 +                                            desc, false, nbytes);
1911                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1912 +               glue_fpu_end(fpu_enabled);
1913                 err = blkcipher_walk_done(desc, &walk, nbytes);
1914         }
1916 -       glue_fpu_end(fpu_enabled);
1917         return err;
1919  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1920 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1921                           struct scatterlist *src, unsigned int nbytes)
1923         const unsigned int bsize = 128 / 8;
1924 -       bool fpu_enabled = false;
1925 +       bool fpu_enabled;
1926         struct blkcipher_walk walk;
1927         int err;
1929 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1931         while ((nbytes = walk.nbytes) >= bsize) {
1932                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1933 -                                            desc, fpu_enabled, nbytes);
1934 +                                            desc, false, nbytes);
1935                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1936 +               glue_fpu_end(fpu_enabled);
1937                 err = blkcipher_walk_done(desc, &walk, nbytes);
1938         }
1940 -       glue_fpu_end(fpu_enabled);
1942         if (walk.nbytes) {
1943                 glue_ctr_crypt_final_128bit(
1944                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1945 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1946                           void *tweak_ctx, void *crypt_ctx)
1948         const unsigned int bsize = 128 / 8;
1949 -       bool fpu_enabled = false;
1950 +       bool fpu_enabled;
1951         struct blkcipher_walk walk;
1952         int err;
1954 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1956         /* set minimum length to bsize, for tweak_fn */
1957         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1958 -                                    desc, fpu_enabled,
1959 +                                    desc, false,
1960                                      nbytes < bsize ? bsize : nbytes);
1962         /* calculate first value of T */
1963         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1964 +       glue_fpu_end(fpu_enabled);
1966         while (nbytes) {
1967 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1968 +                               desc, false, nbytes);
1969                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1971 +               glue_fpu_end(fpu_enabled);
1972                 err = blkcipher_walk_done(desc, &walk, nbytes);
1973                 nbytes = walk.nbytes;
1974         }
1976 -       glue_fpu_end(fpu_enabled);
1978         return err;
1980  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1981 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1982 index bdd9cc59d20f..56d01a339ba4 100644
1983 --- a/arch/x86/entry/common.c
1984 +++ b/arch/x86/entry/common.c
1985 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1987  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1988         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1989 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1990 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1992  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1994 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1995                 /* We have work to do. */
1996                 local_irq_enable();
1998 -               if (cached_flags & _TIF_NEED_RESCHED)
1999 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2000                         schedule();
2002 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2003 +               if (unlikely(current->forced_info.si_signo)) {
2004 +                       struct task_struct *t = current;
2005 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2006 +                       t->forced_info.si_signo = 0;
2007 +               }
2008 +#endif
2009                 if (cached_flags & _TIF_UPROBE)
2010                         uprobe_notify_resume(regs);
2012 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2013 index edba8606b99a..4a3389535fc6 100644
2014 --- a/arch/x86/entry/entry_32.S
2015 +++ b/arch/x86/entry/entry_32.S
2016 @@ -308,8 +308,25 @@ END(ret_from_exception)
2017  ENTRY(resume_kernel)
2018         DISABLE_INTERRUPTS(CLBR_ANY)
2019  need_resched:
2020 +       # preempt count == 0 + NEED_RS set?
2021         cmpl    $0, PER_CPU_VAR(__preempt_count)
2022 +#ifndef CONFIG_PREEMPT_LAZY
2023         jnz     restore_all
2024 +#else
2025 +       jz test_int_off
2027 +       # at least preempt count == 0 ?
2028 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2029 +       jne restore_all
2031 +       movl    PER_CPU_VAR(current_task), %ebp
2032 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2033 +       jnz restore_all
2035 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2036 +       jz restore_all
2037 +test_int_off:
2038 +#endif
2039         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2040         jz      restore_all
2041         call    preempt_schedule_irq
2042 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2043 index af4e58132d91..22803e2f7495 100644
2044 --- a/arch/x86/entry/entry_64.S
2045 +++ b/arch/x86/entry/entry_64.S
2046 @@ -575,7 +575,23 @@ retint_kernel:
2047         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2048         jnc     1f
2049  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2050 +#ifndef CONFIG_PREEMPT_LAZY
2051         jnz     1f
2052 +#else
2053 +       jz      do_preempt_schedule_irq
2055 +       # at least preempt count == 0 ?
2056 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2057 +       jnz     1f
2059 +       movq    PER_CPU_VAR(current_task), %rcx
2060 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2061 +       jnz     1f
2063 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2064 +       jnc     1f
2065 +do_preempt_schedule_irq:
2066 +#endif
2067         call    preempt_schedule_irq
2068         jmp     0b
2069  1:
2070 @@ -925,6 +941,7 @@ bad_gs:
2071         jmp     2b
2072         .previous
2074 +#ifndef CONFIG_PREEMPT_RT_FULL
2075  /* Call softirq on interrupt stack. Interrupts are off. */
2076  ENTRY(do_softirq_own_stack)
2077         pushq   %rbp
2078 @@ -937,6 +954,7 @@ ENTRY(do_softirq_own_stack)
2079         decl    PER_CPU_VAR(irq_count)
2080         ret
2081  END(do_softirq_own_stack)
2082 +#endif
2084  #ifdef CONFIG_XEN
2085  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2086 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2087 index 17f218645701..11bd1b7ee6eb 100644
2088 --- a/arch/x86/include/asm/preempt.h
2089 +++ b/arch/x86/include/asm/preempt.h
2090 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2091   * a decrement which hits zero means we have no preempt_count and should
2092   * reschedule.
2093   */
2094 -static __always_inline bool __preempt_count_dec_and_test(void)
2095 +static __always_inline bool ____preempt_count_dec_and_test(void)
2097         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2100 +static __always_inline bool __preempt_count_dec_and_test(void)
2102 +       if (____preempt_count_dec_and_test())
2103 +               return true;
2104 +#ifdef CONFIG_PREEMPT_LAZY
2105 +       if (current_thread_info()->preempt_lazy_count)
2106 +               return false;
2107 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2108 +#else
2109 +       return false;
2110 +#endif
2113  /*
2114   * Returns true when we need to resched and can (barring IRQ state).
2115   */
2116  static __always_inline bool should_resched(int preempt_offset)
2118 +#ifdef CONFIG_PREEMPT_LAZY
2119 +       u32 tmp;
2121 +       tmp = raw_cpu_read_4(__preempt_count);
2122 +       if (tmp == preempt_offset)
2123 +               return true;
2125 +       /* preempt count == 0 ? */
2126 +       tmp &= ~PREEMPT_NEED_RESCHED;
2127 +       if (tmp)
2128 +               return false;
2129 +       if (current_thread_info()->preempt_lazy_count)
2130 +               return false;
2131 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2132 +#else
2133         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2134 +#endif
2137  #ifdef CONFIG_PREEMPT
2138 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2139 index 8af22be0fe61..d1328789b759 100644
2140 --- a/arch/x86/include/asm/signal.h
2141 +++ b/arch/x86/include/asm/signal.h
2142 @@ -27,6 +27,19 @@ typedef struct {
2143  #define SA_IA32_ABI    0x02000000u
2144  #define SA_X32_ABI     0x01000000u
2147 + * Because some traps use the IST stack, we must keep preemption
2148 + * disabled while calling do_trap(), but do_trap() may call
2149 + * force_sig_info() which will grab the signal spin_locks for the
2150 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2151 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2152 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2153 + * trap.
2154 + */
2155 +#if defined(CONFIG_PREEMPT_RT_FULL)
2156 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2157 +#endif
2159  #ifndef CONFIG_COMPAT
2160  typedef sigset_t compat_sigset_t;
2161  #endif
2162 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2163 index 58505f01962f..02fa39652cd6 100644
2164 --- a/arch/x86/include/asm/stackprotector.h
2165 +++ b/arch/x86/include/asm/stackprotector.h
2166 @@ -59,7 +59,7 @@
2167   */
2168  static __always_inline void boot_init_stack_canary(void)
2170 -       u64 canary;
2171 +       u64 uninitialized_var(canary);
2172         u64 tsc;
2174  #ifdef CONFIG_X86_64
2175 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2176          * of randomness. The TSC only matters for very early init,
2177          * there it already has some randomness on most systems. Later
2178          * on during the bootup the random pool has true entropy too.
2179 +        *
2180 +        * For preempt-rt we need to weaken the randomness a bit, as
2181 +        * we can't call into the random generator from atomic context
2182 +        * due to locking constraints. We just leave canary
2183 +        * uninitialized and use the TSC based randomness on top of it.
2184          */
2185 +#ifndef CONFIG_PREEMPT_RT_FULL
2186         get_random_bytes(&canary, sizeof(canary));
2187 +#endif
2188         tsc = rdtsc();
2189         canary += tsc + (tsc << 32UL);
2191 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2192 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2193 --- a/arch/x86/include/asm/thread_info.h
2194 +++ b/arch/x86/include/asm/thread_info.h
2195 @@ -54,11 +54,14 @@ struct task_struct;
2197  struct thread_info {
2198         unsigned long           flags;          /* low level flags */
2199 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2200 +                                                          <0 => BUG */
2201  };
2203  #define INIT_THREAD_INFO(tsk)                  \
2204  {                                              \
2205         .flags          = 0,                    \
2206 +       .preempt_lazy_count = 0,                \
2209  #define init_stack             (init_thread_union.stack)
2210 @@ -67,6 +70,10 @@ struct thread_info {
2212  #include <asm/asm-offsets.h>
2214 +#define GET_THREAD_INFO(reg) \
2215 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2216 +       _ASM_SUB $(THREAD_SIZE),reg ;
2218  #endif
2220  /*
2221 @@ -85,6 +92,7 @@ struct thread_info {
2222  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2223  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2224  #define TIF_SECCOMP            8       /* secure computing */
2225 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2226  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2227  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2228  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2229 @@ -108,6 +116,7 @@ struct thread_info {
2230  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2231  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2232  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2233 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2234  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2235  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2236  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2237 @@ -143,6 +152,8 @@ struct thread_info {
2238  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2239  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2241 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2243  #define STACK_WARN             (THREAD_SIZE/8)
2245  /*
2246 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2247 index 57ab86d94d64..35d25e27180f 100644
2248 --- a/arch/x86/include/asm/uv/uv_bau.h
2249 +++ b/arch/x86/include/asm/uv/uv_bau.h
2250 @@ -624,9 +624,9 @@ struct bau_control {
2251         cycles_t                send_message;
2252         cycles_t                period_end;
2253         cycles_t                period_time;
2254 -       spinlock_t              uvhub_lock;
2255 -       spinlock_t              queue_lock;
2256 -       spinlock_t              disable_lock;
2257 +       raw_spinlock_t          uvhub_lock;
2258 +       raw_spinlock_t          queue_lock;
2259 +       raw_spinlock_t          disable_lock;
2260         /* tunables */
2261         int                     max_concurr;
2262         int                     max_concurr_const;
2263 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2264   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2265   * on equal.
2266   */
2267 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2268 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2270 -       spin_lock(lock);
2271 +       raw_spin_lock(lock);
2272         if (atomic_read(v) >= u) {
2273 -               spin_unlock(lock);
2274 +               raw_spin_unlock(lock);
2275                 return 0;
2276         }
2277         atomic_inc(v);
2278 -       spin_unlock(lock);
2279 +       raw_spin_unlock(lock);
2280         return 1;
2283 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2284 index 11cc600f4df0..8cbfc51ce339 100644
2285 --- a/arch/x86/kernel/acpi/boot.c
2286 +++ b/arch/x86/kernel/acpi/boot.c
2287 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2288   *             ->ioapic_mutex
2289   *                     ->ioapic_lock
2290   */
2291 +#ifdef CONFIG_X86_IO_APIC
2292  static DEFINE_MUTEX(acpi_ioapic_lock);
2293 +#endif
2295  /* --------------------------------------------------------------------------
2296                                Boot-time Configuration
2297 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2298 index cf89928dbd46..18b5ec2a71df 100644
2299 --- a/arch/x86/kernel/apic/io_apic.c
2300 +++ b/arch/x86/kernel/apic/io_apic.c
2301 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2302  static inline bool ioapic_irqd_mask(struct irq_data *data)
2304         /* If we are moving the irq we need to mask it */
2305 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2306 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2307 +                    !irqd_irq_inprogress(data))) {
2308                 mask_ioapic_irq(data);
2309                 return true;
2310         }
2311 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2312 index c62e015b126c..0cc71257fca6 100644
2313 --- a/arch/x86/kernel/asm-offsets.c
2314 +++ b/arch/x86/kernel/asm-offsets.c
2315 @@ -36,6 +36,7 @@ void common(void) {
2317         BLANK();
2318         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2319 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2320         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2322         BLANK();
2323 @@ -91,4 +92,5 @@ void common(void) {
2325         BLANK();
2326         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2327 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2329 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2330 index 8ca5f8ad008e..edcbd18b3189 100644
2331 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2332 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2333 @@ -41,6 +41,8 @@
2334  #include <linux/debugfs.h>
2335  #include <linux/irq_work.h>
2336  #include <linux/export.h>
2337 +#include <linux/jiffies.h>
2338 +#include <linux/swork.h>
2339  #include <linux/jump_label.h>
2341  #include <asm/processor.h>
2342 @@ -1306,7 +1308,7 @@ void mce_log_therm_throt_event(__u64 status)
2343  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2345  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2346 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2347 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2349  static unsigned long mce_adjust_timer_default(unsigned long interval)
2351 @@ -1315,32 +1317,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2353  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2355 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2356 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2358 -       unsigned long when = jiffies + interval;
2359 -       unsigned long flags;
2361 -       local_irq_save(flags);
2363 -       if (timer_pending(t)) {
2364 -               if (time_before(when, t->expires))
2365 -                       mod_timer(t, when);
2366 -       } else {
2367 -               t->expires = round_jiffies(when);
2368 -               add_timer_on(t, smp_processor_id());
2369 -       }
2371 -       local_irq_restore(flags);
2372 +       if (!interval)
2373 +               return HRTIMER_NORESTART;
2374 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2375 +       return HRTIMER_RESTART;
2378 -static void mce_timer_fn(unsigned long data)
2379 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2381 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2382 -       int cpu = smp_processor_id();
2383         unsigned long iv;
2385 -       WARN_ON(cpu != data);
2387         iv = __this_cpu_read(mce_next_interval);
2389         if (mce_available(this_cpu_ptr(&cpu_info))) {
2390 @@ -1363,7 +1351,7 @@ static void mce_timer_fn(unsigned long data)
2392  done:
2393         __this_cpu_write(mce_next_interval, iv);
2394 -       __restart_timer(t, iv);
2395 +       return __restart_timer(timer, iv);
2398  /*
2399 @@ -1371,7 +1359,7 @@ static void mce_timer_fn(unsigned long data)
2400   */
2401  void mce_timer_kick(unsigned long interval)
2403 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2404 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2405         unsigned long iv = __this_cpu_read(mce_next_interval);
2407         __restart_timer(t, interval);
2408 @@ -1386,7 +1374,7 @@ static void mce_timer_delete_all(void)
2409         int cpu;
2411         for_each_online_cpu(cpu)
2412 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2413 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2416  static void mce_do_trigger(struct work_struct *work)
2417 @@ -1396,6 +1384,56 @@ static void mce_do_trigger(struct work_struct *work)
2419  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2421 +static void __mce_notify_work(struct swork_event *event)
2423 +       /* Not more than two messages every minute */
2424 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2426 +       /* wake processes polling /dev/mcelog */
2427 +       wake_up_interruptible(&mce_chrdev_wait);
2429 +       /*
2430 +        * There is no risk of missing notifications because
2431 +        * work_pending is always cleared before the function is
2432 +        * executed.
2433 +        */
2434 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2435 +               schedule_work(&mce_trigger_work);
2437 +       if (__ratelimit(&ratelimit))
2438 +               pr_info(HW_ERR "Machine check events logged\n");
2441 +#ifdef CONFIG_PREEMPT_RT_FULL
2442 +static bool notify_work_ready __read_mostly;
2443 +static struct swork_event notify_work;
2445 +static int mce_notify_work_init(void)
2447 +       int err;
2449 +       err = swork_get();
2450 +       if (err)
2451 +               return err;
2453 +       INIT_SWORK(&notify_work, __mce_notify_work);
2454 +       notify_work_ready = true;
2455 +       return 0;
2458 +static void mce_notify_work(void)
2460 +       if (notify_work_ready)
2461 +               swork_queue(&notify_work);
2463 +#else
2464 +static void mce_notify_work(void)
2466 +       __mce_notify_work(NULL);
2468 +static inline int mce_notify_work_init(void) { return 0; }
2469 +#endif
2471  /*
2472   * Notify the user(s) about new machine check events.
2473   * Can be called from interrupt context, but not from machine check/NMI
2474 @@ -1403,19 +1441,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2475   */
2476  int mce_notify_irq(void)
2478 -       /* Not more than two messages every minute */
2479 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2481         if (test_and_clear_bit(0, &mce_need_notify)) {
2482 -               /* wake processes polling /dev/mcelog */
2483 -               wake_up_interruptible(&mce_chrdev_wait);
2485 -               if (mce_helper[0])
2486 -                       schedule_work(&mce_trigger_work);
2488 -               if (__ratelimit(&ratelimit))
2489 -                       pr_info(HW_ERR "Machine check events logged\n");
2491 +               mce_notify_work();
2492                 return 1;
2493         }
2494         return 0;
2495 @@ -1721,7 +1748,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2496         }
2499 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2500 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2502         unsigned long iv = check_interval * HZ;
2504 @@ -1730,16 +1757,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2506         per_cpu(mce_next_interval, cpu) = iv;
2508 -       t->expires = round_jiffies(jiffies + iv);
2509 -       add_timer_on(t, cpu);
2510 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2511 +                       0, HRTIMER_MODE_REL_PINNED);
2514  static void __mcheck_cpu_init_timer(void)
2516 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2517 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2518         unsigned int cpu = smp_processor_id();
2520 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2521 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2522 +       t->function = mce_timer_fn;
2523         mce_start_timer(cpu, t);
2526 @@ -2464,6 +2492,8 @@ static void mce_disable_cpu(void *h)
2527         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2528                 return;
2530 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2532         if (!(action & CPU_TASKS_FROZEN))
2533                 cmci_clear();
2535 @@ -2486,6 +2516,7 @@ static void mce_reenable_cpu(void *h)
2536                 if (b->init)
2537                         wrmsrl(msr_ops.ctl(i), b->ctl);
2538         }
2539 +       __mcheck_cpu_init_timer();
2542  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2543 @@ -2493,7 +2524,6 @@ static int
2544  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2546         unsigned int cpu = (unsigned long)hcpu;
2547 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2549         switch (action & ~CPU_TASKS_FROZEN) {
2550         case CPU_ONLINE:
2551 @@ -2513,11 +2543,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2552                 break;
2553         case CPU_DOWN_PREPARE:
2554                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2555 -               del_timer_sync(t);
2556                 break;
2557         case CPU_DOWN_FAILED:
2558                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2559 -               mce_start_timer(cpu, t);
2560                 break;
2561         }
2563 @@ -2556,6 +2584,10 @@ static __init int mcheck_init_device(void)
2564                 goto err_out;
2565         }
2567 +       err = mce_notify_work_init();
2568 +       if (err)
2569 +               goto err_out;
2571         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2572                 err = -ENOMEM;
2573                 goto err_out;
2574 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2575 index 1f38d9a4d9de..053bf3b2ef39 100644
2576 --- a/arch/x86/kernel/irq_32.c
2577 +++ b/arch/x86/kernel/irq_32.c
2578 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2579                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2582 +#ifndef CONFIG_PREEMPT_RT_FULL
2583  void do_softirq_own_stack(void)
2585         struct irq_stack *irqstk;
2586 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2588         call_on_stack(__do_softirq, isp);
2590 +#endif
2592  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2594 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2595 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2596 --- a/arch/x86/kernel/process_32.c
2597 +++ b/arch/x86/kernel/process_32.c
2598 @@ -35,6 +35,7 @@
2599  #include <linux/uaccess.h>
2600  #include <linux/io.h>
2601  #include <linux/kdebug.h>
2602 +#include <linux/highmem.h>
2604  #include <asm/pgtable.h>
2605  #include <asm/ldt.h>
2606 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2608  EXPORT_SYMBOL_GPL(start_thread);
2610 +#ifdef CONFIG_PREEMPT_RT_FULL
2611 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2613 +       int i;
2615 +       /*
2616 +        * Clear @prev's kmap_atomic mappings
2617 +        */
2618 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2619 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2620 +               pte_t *ptep = kmap_pte - idx;
2622 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2623 +       }
2624 +       /*
2625 +        * Restore @next_p's kmap_atomic mappings
2626 +        */
2627 +       for (i = 0; i < next_p->kmap_idx; i++) {
2628 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2630 +               if (!pte_none(next_p->kmap_pte[i]))
2631 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2632 +       }
2634 +#else
2635 +static inline void
2636 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2637 +#endif
2640  /*
2641   *     switch_to(x,y) should switch tasks from x to y.
2642 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2643                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2644                 __switch_to_xtra(prev_p, next_p, tss);
2646 +       switch_kmaps(prev_p, next_p);
2648         /*
2649          * Leave lazy mode, flushing any hypercalls made here.
2650          * This must be done before restoring TLS segments so
2651 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2652 index b24b3c6d686e..02a062b0de5d 100644
2653 --- a/arch/x86/kvm/lapic.c
2654 +++ b/arch/x86/kvm/lapic.c
2655 @@ -1944,6 +1944,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2656         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2657                      HRTIMER_MODE_ABS_PINNED);
2658         apic->lapic_timer.timer.function = apic_timer_fn;
2659 +       apic->lapic_timer.timer.irqsafe = 1;
2661         /*
2662          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2663 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2664 index 73304b1a03cc..2a0fae2ef089 100644
2665 --- a/arch/x86/kvm/x86.c
2666 +++ b/arch/x86/kvm/x86.c
2667 @@ -5967,6 +5967,13 @@ int kvm_arch_init(void *opaque)
2668                 goto out;
2669         }
2671 +#ifdef CONFIG_PREEMPT_RT_FULL
2672 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2673 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2674 +               return -EOPNOTSUPP;
2675 +       }
2676 +#endif
2678         r = kvm_mmu_module_init();
2679         if (r)
2680                 goto out_free_percpu;
2681 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2682 index 6d18b70ed5a9..f752724c22e8 100644
2683 --- a/arch/x86/mm/highmem_32.c
2684 +++ b/arch/x86/mm/highmem_32.c
2685 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2686   */
2687  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2689 +       pte_t pte = mk_pte(page, prot);
2690         unsigned long vaddr;
2691         int idx, type;
2693 -       preempt_disable();
2694 +       preempt_disable_nort();
2695         pagefault_disable();
2697         if (!PageHighMem(page))
2698 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2699         idx = type + KM_TYPE_NR*smp_processor_id();
2700         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2701         BUG_ON(!pte_none(*(kmap_pte-idx)));
2702 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2703 +#ifdef CONFIG_PREEMPT_RT_FULL
2704 +       current->kmap_pte[type] = pte;
2705 +#endif
2706 +       set_pte(kmap_pte-idx, pte);
2707         arch_flush_lazy_mmu_mode();
2709         return (void *)vaddr;
2710 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2711                  * is a bad idea also, in case the page changes cacheability
2712                  * attributes or becomes a protected page in a hypervisor.
2713                  */
2714 +#ifdef CONFIG_PREEMPT_RT_FULL
2715 +               current->kmap_pte[type] = __pte(0);
2716 +#endif
2717                 kpte_clear_flush(kmap_pte-idx, vaddr);
2718                 kmap_atomic_idx_pop();
2719                 arch_flush_lazy_mmu_mode();
2720 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2721  #endif
2723         pagefault_enable();
2724 -       preempt_enable();
2725 +       preempt_enable_nort();
2727  EXPORT_SYMBOL(__kunmap_atomic);
2729 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2730 index ada98b39b8ad..585f6829653b 100644
2731 --- a/arch/x86/mm/iomap_32.c
2732 +++ b/arch/x86/mm/iomap_32.c
2733 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2735  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2737 +       pte_t pte = pfn_pte(pfn, prot);
2738         unsigned long vaddr;
2739         int idx, type;
2741 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2742         type = kmap_atomic_idx_push();
2743         idx = type + KM_TYPE_NR * smp_processor_id();
2744         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2745 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2746 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2748 +#ifdef CONFIG_PREEMPT_RT_FULL
2749 +       current->kmap_pte[type] = pte;
2750 +#endif
2751 +       set_pte(kmap_pte - idx, pte);
2752         arch_flush_lazy_mmu_mode();
2754         return (void *)vaddr;
2755 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2756                  * is a bad idea also, in case the page changes cacheability
2757                  * attributes or becomes a protected page in a hypervisor.
2758                  */
2759 +#ifdef CONFIG_PREEMPT_RT_FULL
2760 +               current->kmap_pte[type] = __pte(0);
2761 +#endif
2762                 kpte_clear_flush(kmap_pte-idx, vaddr);
2763                 kmap_atomic_idx_pop();
2764         }
2765 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2766 index 73dcb0e18c1b..c1085c7ee212 100644
2767 --- a/arch/x86/mm/pageattr.c
2768 +++ b/arch/x86/mm/pageattr.c
2769 @@ -215,7 +215,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
2770                             int in_flags, struct page **pages)
2772         unsigned int i, level;
2773 +#ifdef CONFIG_PREEMPT
2774 +       /*
2775 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2776 +        * regardless of any CPU isolation that may be in effect.
2777 +        */
2778 +       unsigned long do_wbinvd = 0;
2779 +#else
2780         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2781 +#endif
2783         BUG_ON(irqs_disabled());
2785 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2786 index 0f0175186f1b..39b5d5b2627d 100644
2787 --- a/arch/x86/platform/uv/tlb_uv.c
2788 +++ b/arch/x86/platform/uv/tlb_uv.c
2789 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2791                 quiesce_local_uvhub(hmaster);
2793 -               spin_lock(&hmaster->queue_lock);
2794 +               raw_spin_lock(&hmaster->queue_lock);
2795                 reset_with_ipi(&bau_desc->distribution, bcp);
2796 -               spin_unlock(&hmaster->queue_lock);
2797 +               raw_spin_unlock(&hmaster->queue_lock);
2799                 end_uvhub_quiesce(hmaster);
2801 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2803                 quiesce_local_uvhub(hmaster);
2805 -               spin_lock(&hmaster->queue_lock);
2806 +               raw_spin_lock(&hmaster->queue_lock);
2807                 reset_with_ipi(&bau_desc->distribution, bcp);
2808 -               spin_unlock(&hmaster->queue_lock);
2809 +               raw_spin_unlock(&hmaster->queue_lock);
2811                 end_uvhub_quiesce(hmaster);
2813 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2814         cycles_t tm1;
2816         hmaster = bcp->uvhub_master;
2817 -       spin_lock(&hmaster->disable_lock);
2818 +       raw_spin_lock(&hmaster->disable_lock);
2819         if (!bcp->baudisabled) {
2820                 stat->s_bau_disabled++;
2821                 tm1 = get_cycles();
2822 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2823                         }
2824                 }
2825         }
2826 -       spin_unlock(&hmaster->disable_lock);
2827 +       raw_spin_unlock(&hmaster->disable_lock);
2830  static void count_max_concurr(int stat, struct bau_control *bcp,
2831 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2832   */
2833  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2835 -       spinlock_t *lock = &hmaster->uvhub_lock;
2836 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2837         atomic_t *v;
2839         v = &hmaster->active_descriptor_count;
2840 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2841         struct bau_control *hmaster;
2843         hmaster = bcp->uvhub_master;
2844 -       spin_lock(&hmaster->disable_lock);
2845 +       raw_spin_lock(&hmaster->disable_lock);
2846         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2847                 stat->s_bau_reenabled++;
2848                 for_each_present_cpu(tcpu) {
2849 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2850                                 tbcp->period_giveups = 0;
2851                         }
2852                 }
2853 -               spin_unlock(&hmaster->disable_lock);
2854 +               raw_spin_unlock(&hmaster->disable_lock);
2855                 return 0;
2856         }
2857 -       spin_unlock(&hmaster->disable_lock);
2858 +       raw_spin_unlock(&hmaster->disable_lock);
2859         return -1;
2862 @@ -1939,9 +1939,9 @@ static void __init init_per_cpu_tunables(void)
2863                 bcp->cong_reps                  = congested_reps;
2864                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2865                 bcp->giveup_limit               = giveup_limit;
2866 -               spin_lock_init(&bcp->queue_lock);
2867 -               spin_lock_init(&bcp->uvhub_lock);
2868 -               spin_lock_init(&bcp->disable_lock);
2869 +               raw_spin_lock_init(&bcp->queue_lock);
2870 +               raw_spin_lock_init(&bcp->uvhub_lock);
2871 +               raw_spin_lock_init(&bcp->disable_lock);
2872         }
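
The tlb_uv.c conversion is the canonical PREEMPT_RT pattern: spinlock_t turns into a sleeping rtmutex on RT, so locks taken from contexts that must not sleep are switched to raw_spinlock_t, which keeps true spinning semantics. A self-contained sketch of that idiom; the lock and the data it protects are made up for illustration.

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(hw_state_lock);
static unsigned int hw_state;

/* Short, bounded critical section that must not sleep, even on RT. */
static void hw_state_set(unsigned int val)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&hw_state_lock, flags);
        hw_state = val;
        raw_spin_unlock_irqrestore(&hw_state_lock, flags);
}
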
2875 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2876 index b333fc45f9ec..8b85916e6986 100644
2877 --- a/arch/x86/platform/uv/uv_time.c
2878 +++ b/arch/x86/platform/uv/uv_time.c
2879 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2881  /* There is one of these allocated per node */
2882  struct uv_rtc_timer_head {
2883 -       spinlock_t      lock;
2884 +       raw_spinlock_t  lock;
2885         /* next cpu waiting for timer, local node relative: */
2886         int             next_cpu;
2887         /* number of cpus on this node: */
2888 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2889                                 uv_rtc_deallocate_timers();
2890                                 return -ENOMEM;
2891                         }
2892 -                       spin_lock_init(&head->lock);
2893 +                       raw_spin_lock_init(&head->lock);
2894                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2895                         head->next_cpu = -1;
2896                         blade_info[bid] = head;
2897 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2898         unsigned long flags;
2899         int next_cpu;
2901 -       spin_lock_irqsave(&head->lock, flags);
2902 +       raw_spin_lock_irqsave(&head->lock, flags);
2904         next_cpu = head->next_cpu;
2905         *t = expires;
2906 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2907                 if (uv_setup_intr(cpu, expires)) {
2908                         *t = ULLONG_MAX;
2909                         uv_rtc_find_next_timer(head, pnode);
2910 -                       spin_unlock_irqrestore(&head->lock, flags);
2911 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2912                         return -ETIME;
2913                 }
2914         }
2916 -       spin_unlock_irqrestore(&head->lock, flags);
2917 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2918         return 0;
2921 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2922         unsigned long flags;
2923         int rc = 0;
2925 -       spin_lock_irqsave(&head->lock, flags);
2926 +       raw_spin_lock_irqsave(&head->lock, flags);
2928         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2929                 rc = 1;
2930 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2931                         uv_rtc_find_next_timer(head, pnode);
2932         }
2934 -       spin_unlock_irqrestore(&head->lock, flags);
2935 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2937         return rc;
2939 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2940  static cycle_t uv_read_rtc(struct clocksource *cs)
2942         unsigned long offset;
2943 +       cycle_t cycles;
2945 +       preempt_disable();
2946         if (uv_get_min_hub_revision_id() == 1)
2947                 offset = 0;
2948         else
2949                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2951 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2952 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2953 +       preempt_enable();
2955 +       return cycles;
2958  /*
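
Besides the raw lock, uv_read_rtc() gains a preempt_disable()/preempt_enable() pair so the per-CPU MMR offset and the read itself happen on the same CPU. The general shape, reduced to a sketch with a made-up per-CPU variable:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/types.h>

static DEFINE_PER_CPU(u64, node_counter);

static u64 node_counter_read(void)
{
        u64 val;

        preempt_disable();              /* stay on one CPU for the access */
        val = __this_cpu_read(node_counter);
        preempt_enable();

        return val;
}
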
2959 diff --git a/block/blk-core.c b/block/blk-core.c
2960 index 23daf40be371..e8341f78f119 100644
2961 --- a/block/blk-core.c
2962 +++ b/block/blk-core.c
2963 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2965         INIT_LIST_HEAD(&rq->queuelist);
2966         INIT_LIST_HEAD(&rq->timeout_list);
2967 +#ifdef CONFIG_PREEMPT_RT_FULL
2968 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2969 +#endif
2970         rq->cpu = -1;
2971         rq->q = q;
2972         rq->__sector = (sector_t) -1;
2973 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2974   **/
2975  void blk_start_queue(struct request_queue *q)
2977 -       WARN_ON(!in_interrupt() && !irqs_disabled());
2978 +       WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
2980         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2981         __blk_run_queue(q);
2982 @@ -660,7 +663,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2983                 if (nowait)
2984                         return -EBUSY;
2986 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2987 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2988                                 !atomic_read(&q->mq_freeze_depth) ||
2989                                 blk_queue_dying(q));
2990                 if (blk_queue_dying(q))
2991 @@ -680,7 +683,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2992         struct request_queue *q =
2993                 container_of(ref, struct request_queue, q_usage_counter);
2995 -       wake_up_all(&q->mq_freeze_wq);
2996 +       swake_up_all(&q->mq_freeze_wq);
2999  static void blk_rq_timed_out_timer(unsigned long data)
3000 @@ -750,7 +753,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3001         q->bypass_depth = 1;
3002         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3004 -       init_waitqueue_head(&q->mq_freeze_wq);
3005 +       init_swait_queue_head(&q->mq_freeze_wq);
3007         /*
3008          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3009 @@ -3202,7 +3205,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3010                 blk_run_queue_async(q);
3011         else
3012                 __blk_run_queue(q);
3013 -       spin_unlock(q->queue_lock);
3014 +       spin_unlock_irq(q->queue_lock);
3017  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3018 @@ -3250,7 +3253,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3019  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3021         struct request_queue *q;
3022 -       unsigned long flags;
3023         struct request *rq;
3024         LIST_HEAD(list);
3025         unsigned int depth;
3026 @@ -3270,11 +3272,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3027         q = NULL;
3028         depth = 0;
3030 -       /*
3031 -        * Save and disable interrupts here, to avoid doing it for every
3032 -        * queue lock we have to take.
3033 -        */
3034 -       local_irq_save(flags);
3035         while (!list_empty(&list)) {
3036                 rq = list_entry_rq(list.next);
3037                 list_del_init(&rq->queuelist);
3038 @@ -3287,7 +3284,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3039                                 queue_unplugged(q, depth, from_schedule);
3040                         q = rq->q;
3041                         depth = 0;
3042 -                       spin_lock(q->queue_lock);
3043 +                       spin_lock_irq(q->queue_lock);
3044                 }
3046                 /*
3047 @@ -3314,8 +3311,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3048          */
3049         if (q)
3050                 queue_unplugged(q, depth, from_schedule);
3052 -       local_irq_restore(flags);
3055  void blk_finish_plug(struct blk_plug *plug)
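
blk-core.c (and blk-mq.c below) move mq_freeze_wq from a regular waitqueue to a simple waitqueue: swait heads use a raw lock internally and do bounded work in the wakeup path, which is what the RT tree wants here. A minimal sketch of the same wait/wake pairing, with names invented for the example:

#include <linux/atomic.h>
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(freeze_wq);
static atomic_t freeze_depth = ATOMIC_INIT(0);

/* Sleep, interruptibly, until every freezer has gone away. */
static int wait_for_unfreeze(void)
{
        return swait_event_interruptible(freeze_wq,
                                         atomic_read(&freeze_depth) == 0);
}

static void unfreeze(void)
{
        if (atomic_dec_and_test(&freeze_depth))
                swake_up_all(&freeze_wq);
}
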
3056 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3057 index 381cb50a673c..dc8785233d94 100644
3058 --- a/block/blk-ioc.c
3059 +++ b/block/blk-ioc.c
3060 @@ -7,6 +7,7 @@
3061  #include <linux/bio.h>
3062  #include <linux/blkdev.h>
3063  #include <linux/slab.h>
3064 +#include <linux/delay.h>
3066  #include "blk.h"
3068 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3069                         spin_unlock(q->queue_lock);
3070                 } else {
3071                         spin_unlock_irqrestore(&ioc->lock, flags);
3072 -                       cpu_relax();
3073 +                       cpu_chill();
3074                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3075                 }
3076         }
3077 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3078                         spin_unlock(icq->q->queue_lock);
3079                 } else {
3080                         spin_unlock_irqrestore(&ioc->lock, flags);
3081 -                       cpu_relax();
3082 +                       cpu_chill();
3083                         goto retry;
3084                 }
3085         }
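
The cpu_relax() to cpu_chill() switch in blk-ioc.c addresses a livelock: on RT the lock owner may be a preempted task on the same CPU, so busy-spinning never lets it run. cpu_chill() is added elsewhere in this patch series (declared in <linux/delay.h>) and sleeps briefly instead. A sketch of the retry loop it is meant for, with an invented lock pair:

#include <linux/delay.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(inner_lock);

/* Take "outer" then "inner"; back off instead of spinning on failure. */
static void lock_both(spinlock_t *outer)
{
retry:
        spin_lock(outer);
        if (!spin_trylock(&inner_lock)) {
                spin_unlock(outer);
                cpu_chill();            /* let a preempted owner run */
                goto retry;
        }
        /* ... work under both locks ... */
        spin_unlock(&inner_lock);
        spin_unlock(outer);
}
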
3086 diff --git a/block/blk-mq.c b/block/blk-mq.c
3087 index 10f8f94b7f20..82500641f37b 100644
3088 --- a/block/blk-mq.c
3089 +++ b/block/blk-mq.c
3090 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3092  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3094 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3095 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3098  /*
3099 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3100         WARN_ON_ONCE(freeze_depth < 0);
3101         if (!freeze_depth) {
3102                 percpu_ref_reinit(&q->q_usage_counter);
3103 -               wake_up_all(&q->mq_freeze_wq);
3104 +               swake_up_all(&q->mq_freeze_wq);
3105         }
3107  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3108 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3109          * dying, we need to ensure that processes currently waiting on
3110          * the queue are notified as well.
3111          */
3112 -       wake_up_all(&q->mq_freeze_wq);
3113 +       swake_up_all(&q->mq_freeze_wq);
3116  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3117 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3118         rq->resid_len = 0;
3119         rq->sense = NULL;
3121 +#ifdef CONFIG_PREEMPT_RT_FULL
3122 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3123 +#endif
3124         INIT_LIST_HEAD(&rq->timeout_list);
3125         rq->timeout = 0;
3127 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3129  EXPORT_SYMBOL(blk_mq_end_request);
3131 +#ifdef CONFIG_PREEMPT_RT_FULL
3133 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3135 +       struct request *rq = container_of(work, struct request, work);
3137 +       rq->q->softirq_done_fn(rq);
3140 +#else
3142  static void __blk_mq_complete_request_remote(void *data)
3144         struct request *rq = data;
3145 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3146         rq->q->softirq_done_fn(rq);
3149 +#endif
3151  static void blk_mq_ipi_complete_request(struct request *rq)
3153         struct blk_mq_ctx *ctx = rq->mq_ctx;
3154 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3155                 return;
3156         }
3158 -       cpu = get_cpu();
3159 +       cpu = get_cpu_light();
3160         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3161                 shared = cpus_share_cache(cpu, ctx->cpu);
3163         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3164 +#ifdef CONFIG_PREEMPT_RT_FULL
3165 +               schedule_work_on(ctx->cpu, &rq->work);
3166 +#else
3167                 rq->csd.func = __blk_mq_complete_request_remote;
3168                 rq->csd.info = rq;
3169                 rq->csd.flags = 0;
3170                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3171 +#endif
3172         } else {
3173                 rq->q->softirq_done_fn(rq);
3174         }
3175 -       put_cpu();
3176 +       put_cpu_light();
3179  static void __blk_mq_complete_request(struct request *rq)
3180 @@ -906,14 +926,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3181                 return;
3183         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3184 -               int cpu = get_cpu();
3185 +               int cpu = get_cpu_light();
3186                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3187                         __blk_mq_run_hw_queue(hctx);
3188 -                       put_cpu();
3189 +                       put_cpu_light();
3190                         return;
3191                 }
3193 -               put_cpu();
3194 +               put_cpu_light();
3195         }
3197         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
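
get_cpu_light()/put_cpu_light(), used throughout the blk-mq hunks above, are RT-patch helpers rather than mainline API: they pin the task to its CPU with migrate_disable() instead of disabling preemption, so the bracketed section stays preemptible while the CPU id remains valid. Sketch of the usage pattern, with a placeholder worker function:

#include <linux/smp.h>

static void do_per_cpu_work(int cpu)
{
        /* placeholder: anything that needs a stable CPU id */
}

static void run_on_stable_cpu(void)
{
        int cpu = get_cpu_light();      /* migrate_disable(), still preemptible */

        do_per_cpu_work(cpu);

        put_cpu_light();
}
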
3198 diff --git a/block/blk-mq.h b/block/blk-mq.h
3199 index c55bcf67b956..c26a84d44cc4 100644
3200 --- a/block/blk-mq.h
3201 +++ b/block/blk-mq.h
3202 @@ -73,12 +73,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3203   */
3204  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3206 -       return __blk_mq_get_ctx(q, get_cpu());
3207 +       return __blk_mq_get_ctx(q, get_cpu_light());
3210  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3212 -       put_cpu();
3213 +       put_cpu_light();
3216  struct blk_mq_alloc_data {
3217 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3218 index 06cf9807f49a..c40342643ca0 100644
3219 --- a/block/blk-softirq.c
3220 +++ b/block/blk-softirq.c
3221 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3222                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3224         local_irq_restore(flags);
3225 +       preempt_check_resched_rt();
3228  /*
3229 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3230                          this_cpu_ptr(&blk_cpu_done));
3231         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3232         local_irq_enable();
3233 +       preempt_check_resched_rt();
3235         return 0;
3237 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3238                 goto do_local;
3240         local_irq_restore(flags);
3241 +       preempt_check_resched_rt();
3244  /**
3245 diff --git a/block/bounce.c b/block/bounce.c
3246 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3247 --- a/block/bounce.c
3248 +++ b/block/bounce.c
3249 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3250         unsigned long flags;
3251         unsigned char *vto;
3253 -       local_irq_save(flags);
3254 +       local_irq_save_nort(flags);
3255         vto = kmap_atomic(to->bv_page);
3256         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3257         kunmap_atomic(vto);
3258 -       local_irq_restore(flags);
3259 +       local_irq_restore_nort(flags);
3262  #else /* CONFIG_HIGHMEM */
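
local_irq_save_nort()/local_irq_restore_nort() are also RT-patch additions: on PREEMPT_RT_FULL they compile away, elsewhere they are the ordinary irq save/restore. They mark sections, like the bounce-buffer copy above, that only need interrupts off on non-RT kernels. A hedged sketch; the helper and its name are ours:

#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/types.h>

static void copy_to_highpage(struct page *page, const void *src, size_t len)
{
        unsigned long flags;
        void *dst;

        local_irq_save_nort(flags);     /* irqs stay enabled on RT */
        dst = kmap_atomic(page);
        memcpy(dst, src, len);
        kunmap_atomic(dst);
        local_irq_restore_nort(flags);
}
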
3263 diff --git a/crypto/algapi.c b/crypto/algapi.c
3264 index 1fad2a6b3bbb..ecb7315426a9 100644
3265 --- a/crypto/algapi.c
3266 +++ b/crypto/algapi.c
3267 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3269  int crypto_register_notifier(struct notifier_block *nb)
3271 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3272 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3274  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3276  int crypto_unregister_notifier(struct notifier_block *nb)
3278 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3279 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3281  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3283 diff --git a/crypto/api.c b/crypto/api.c
3284 index bbc147cb5dec..bc1a848f02ec 100644
3285 --- a/crypto/api.c
3286 +++ b/crypto/api.c
3287 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3288  DECLARE_RWSEM(crypto_alg_sem);
3289  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3291 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3292 +SRCU_NOTIFIER_HEAD(crypto_chain);
3293  EXPORT_SYMBOL_GPL(crypto_chain);
3295  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3296 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3298         int ok;
3300 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3301 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3302         if (ok == NOTIFY_DONE) {
3303                 request_module("cryptomgr");
3304 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3305 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3306         }
3308         return ok;
3309 diff --git a/crypto/internal.h b/crypto/internal.h
3310 index 7eefcdb00227..0ecc7f5a2f40 100644
3311 --- a/crypto/internal.h
3312 +++ b/crypto/internal.h
3313 @@ -47,7 +47,7 @@ struct crypto_larval {
3315  extern struct list_head crypto_alg_list;
3316  extern struct rw_semaphore crypto_alg_sem;
3317 -extern struct blocking_notifier_head crypto_chain;
3318 +extern struct srcu_notifier_head crypto_chain;
3320  #ifdef CONFIG_PROC_FS
3321  void __init crypto_init_proc(void);
3322 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3324  static inline void crypto_notify(unsigned long val, void *v)
3326 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3327 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3330  #endif /* _CRYPTO_INTERNAL_H */
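
The crypto hunks move the notifier chain from a blocking notifier, whose read side takes an rw_semaphore, to an SRCU notifier, whose read side does not block for callers of the chain. Both APIs are mainline; a compact sketch of the SRCU flavour with invented names:

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD(example_chain);

static int example_event(struct notifier_block *nb, unsigned long val, void *v)
{
        return NOTIFY_OK;
}

static struct notifier_block example_nb = {
        .notifier_call = example_event,
};

static void example_use(void)
{
        srcu_notifier_chain_register(&example_chain, &example_nb);
        srcu_notifier_call_chain(&example_chain, 1, NULL);
        srcu_notifier_chain_unregister(&example_chain, &example_nb);
}
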
3331 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3332 index 750fa824d42c..441edf51484a 100644
3333 --- a/drivers/acpi/acpica/acglobal.h
3334 +++ b/drivers/acpi/acpica/acglobal.h
3335 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3336   * interrupt level
3337   */
3338  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3339 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3340 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3341  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3343  /* Mutex for _OSI support */
3344 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3345 index 3b7fb99362b6..696bf8e62afb 100644
3346 --- a/drivers/acpi/acpica/hwregs.c
3347 +++ b/drivers/acpi/acpica/hwregs.c
3348 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3349                           ACPI_BITMASK_ALL_FIXED_STATUS,
3350                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3352 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3353 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3355         /* Clear the fixed events in PM1 A/B */
3357         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3358                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3360 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3361 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3363         if (ACPI_FAILURE(status)) {
3364                 goto exit;
3365 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3366 index 98c26ff39409..6e236f2ea791 100644
3367 --- a/drivers/acpi/acpica/hwxface.c
3368 +++ b/drivers/acpi/acpica/hwxface.c
3369 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3370                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3371         }
3373 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3374 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3376         /*
3377          * At this point, we know that the parent register is one of the
3378 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3380  unlock_and_exit:
3382 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3383 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3384         return_ACPI_STATUS(status);
3387 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3388 index 15073375bd00..357e7ca5a587 100644
3389 --- a/drivers/acpi/acpica/utmutex.c
3390 +++ b/drivers/acpi/acpica/utmutex.c
3391 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3392                 return_ACPI_STATUS (status);
3393         }
3395 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3396 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3397         if (ACPI_FAILURE (status)) {
3398                 return_ACPI_STATUS (status);
3399         }
3400 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3401         /* Delete the spinlocks */
3403         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3404 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3405 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3406         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3408         /* Delete the reader/writer lock */
3409 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3410 index 8d22acdf90f0..64fbad747da9 100644
3411 --- a/drivers/ata/libata-sff.c
3412 +++ b/drivers/ata/libata-sff.c
3413 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3414         unsigned long flags;
3415         unsigned int consumed;
3417 -       local_irq_save(flags);
3418 +       local_irq_save_nort(flags);
3419         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3420 -       local_irq_restore(flags);
3421 +       local_irq_restore_nort(flags);
3423         return consumed;
3425 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3426                 unsigned long flags;
3428                 /* FIXME: use a bounce buffer */
3429 -               local_irq_save(flags);
3430 +               local_irq_save_nort(flags);
3431                 buf = kmap_atomic(page);
3433                 /* do the actual data transfer */
3434 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3435                                        do_write);
3437                 kunmap_atomic(buf);
3438 -               local_irq_restore(flags);
3439 +               local_irq_restore_nort(flags);
3440         } else {
3441                 buf = page_address(page);
3442                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3443 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3444                 unsigned long flags;
3446                 /* FIXME: use bounce buffer */
3447 -               local_irq_save(flags);
3448 +               local_irq_save_nort(flags);
3449                 buf = kmap_atomic(page);
3451                 /* do the actual data transfer */
3452 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3453                                                                 count, rw);
3455                 kunmap_atomic(buf);
3456 -               local_irq_restore(flags);
3457 +               local_irq_restore_nort(flags);
3458         } else {
3459                 buf = page_address(page);
3460                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3461 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3462 index 4b5cd3a7b2b6..8c93ee150ee8 100644
3463 --- a/drivers/block/zram/zcomp.c
3464 +++ b/drivers/block/zram/zcomp.c
3465 @@ -118,12 +118,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3467  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3469 -       return *get_cpu_ptr(comp->stream);
3470 +       struct zcomp_strm *zstrm;
3472 +       zstrm = *get_local_ptr(comp->stream);
3473 +       spin_lock(&zstrm->zcomp_lock);
3474 +       return zstrm;
3477  void zcomp_stream_put(struct zcomp *comp)
3479 -       put_cpu_ptr(comp->stream);
3480 +       struct zcomp_strm *zstrm;
3482 +       zstrm = *this_cpu_ptr(comp->stream);
3483 +       spin_unlock(&zstrm->zcomp_lock);
3484 +       put_local_ptr(zstrm);
3487  int zcomp_compress(struct zcomp_strm *zstrm,
3488 @@ -174,6 +182,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3489                         pr_err("Can't allocate a compression stream\n");
3490                         return NOTIFY_BAD;
3491                 }
3492 +               spin_lock_init(&zstrm->zcomp_lock);
3493                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3494                 break;
3495         case CPU_DEAD:
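
zcomp_stream_get()/zcomp_stream_put() now combine get_local_ptr()/put_local_ptr() (RT-patch per-CPU helpers built on migrate_disable()) with a per-stream spinlock, so the compression stream stays CPU-local while the section remains preemptible. A rough sketch of the pairing with invented types; the per-stream lock is assumed to be spin_lock_init()ed during setup:

#include <linux/percpu.h>
#include <linux/spinlock.h>

struct stream {
        spinlock_t lock;                /* initialised at stream setup */
        void *buffer;
};

static DEFINE_PER_CPU(struct stream, streams);

static struct stream *stream_get(void)
{
        struct stream *s = get_local_ptr(&streams);

        spin_lock(&s->lock);
        return s;
}

static void stream_put(struct stream *s)
{
        spin_unlock(&s->lock);
        put_local_ptr(s);
}
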
3496 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3497 index 478cac2ed465..f7a6efdc3285 100644
3498 --- a/drivers/block/zram/zcomp.h
3499 +++ b/drivers/block/zram/zcomp.h
3500 @@ -14,6 +14,7 @@ struct zcomp_strm {
3501         /* compression/decompression buffer */
3502         void *buffer;
3503         struct crypto_comp *tfm;
3504 +       spinlock_t zcomp_lock;
3505  };
3507  /* dynamic per-device compression frontend */
3508 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3509 index b7c0b69a02f5..47d033b8a966 100644
3510 --- a/drivers/block/zram/zram_drv.c
3511 +++ b/drivers/block/zram/zram_drv.c
3512 @@ -528,6 +528,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3513                 goto out_error;
3514         }
3516 +       zram_meta_init_table_locks(meta, disksize);
3518         return meta;
3520  out_error:
3521 @@ -575,28 +577,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3522         struct zram_meta *meta = zram->meta;
3523         unsigned long handle;
3524         unsigned int size;
3525 +       struct zcomp_strm *zstrm;
3527 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3528 +       zram_lock_table(&meta->table[index]);
3529         handle = meta->table[index].handle;
3530         size = zram_get_obj_size(meta, index);
3532         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3533 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3534 +               zram_unlock_table(&meta->table[index]);
3535                 memset(mem, 0, PAGE_SIZE);
3536                 return 0;
3537         }
3539 +       zstrm = zcomp_stream_get(zram->comp);
3540         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3541         if (size == PAGE_SIZE) {
3542                 memcpy(mem, cmem, PAGE_SIZE);
3543         } else {
3544 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3546                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3547 -               zcomp_stream_put(zram->comp);
3548         }
3549         zs_unmap_object(meta->mem_pool, handle);
3550 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3551 +       zcomp_stream_put(zram->comp);
3552 +       zram_unlock_table(&meta->table[index]);
3554         /* Should NEVER happen. Return bio error if it does. */
3555         if (unlikely(ret)) {
3556 @@ -616,14 +618,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3557         struct zram_meta *meta = zram->meta;
3558         page = bvec->bv_page;
3560 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3561 +       zram_lock_table(&meta->table[index]);
3562         if (unlikely(!meta->table[index].handle) ||
3563                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3564 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3565 +               zram_unlock_table(&meta->table[index]);
3566                 handle_zero_page(bvec);
3567                 return 0;
3568         }
3569 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3570 +       zram_unlock_table(&meta->table[index]);
3572         if (is_partial_io(bvec))
3573                 /* Use  a temporary buffer to decompress the page */
3574 @@ -700,10 +702,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3575                 if (user_mem)
3576                         kunmap_atomic(user_mem);
3577                 /* Free memory associated with this sector now. */
3578 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3579 +               zram_lock_table(&meta->table[index]);
3580                 zram_free_page(zram, index);
3581                 zram_set_flag(meta, index, ZRAM_ZERO);
3582 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3583 +               zram_unlock_table(&meta->table[index]);
3585                 atomic64_inc(&zram->stats.zero_pages);
3586                 ret = 0;
3587 @@ -794,12 +796,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3588          * Free memory associated with this sector
3589          * before overwriting unused sectors.
3590          */
3591 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3592 +       zram_lock_table(&meta->table[index]);
3593         zram_free_page(zram, index);
3595         meta->table[index].handle = handle;
3596         zram_set_obj_size(meta, index, clen);
3597 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3598 +       zram_unlock_table(&meta->table[index]);
3600         /* Update stats */
3601         atomic64_add(clen, &zram->stats.compr_data_size);
3602 @@ -842,9 +844,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3603         }
3605         while (n >= PAGE_SIZE) {
3606 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3607 +               zram_lock_table(&meta->table[index]);
3608                 zram_free_page(zram, index);
3609 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3610 +               zram_unlock_table(&meta->table[index]);
3611                 atomic64_inc(&zram->stats.notify_free);
3612                 index++;
3613                 n -= PAGE_SIZE;
3614 @@ -973,9 +975,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3615         zram = bdev->bd_disk->private_data;
3616         meta = zram->meta;
3618 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3619 +       zram_lock_table(&meta->table[index]);
3620         zram_free_page(zram, index);
3621 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3622 +       zram_unlock_table(&meta->table[index]);
3623         atomic64_inc(&zram->stats.notify_free);
3626 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3627 index 74fcf10da374..fd4020c99b9e 100644
3628 --- a/drivers/block/zram/zram_drv.h
3629 +++ b/drivers/block/zram/zram_drv.h
3630 @@ -73,6 +73,9 @@ enum zram_pageflags {
3631  struct zram_table_entry {
3632         unsigned long handle;
3633         unsigned long value;
3634 +#ifdef CONFIG_PREEMPT_RT_BASE
3635 +       spinlock_t lock;
3636 +#endif
3637  };
3639  struct zram_stats {
3640 @@ -120,4 +123,42 @@ struct zram {
3641          */
3642         bool claim; /* Protected by bdev->bd_mutex */
3643  };
3645 +#ifndef CONFIG_PREEMPT_RT_BASE
3646 +static inline void zram_lock_table(struct zram_table_entry *table)
3648 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3651 +static inline void zram_unlock_table(struct zram_table_entry *table)
3653 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3656 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3657 +#else /* CONFIG_PREEMPT_RT_BASE */
3658 +static inline void zram_lock_table(struct zram_table_entry *table)
3660 +       spin_lock(&table->lock);
3661 +       __set_bit(ZRAM_ACCESS, &table->value);
3664 +static inline void zram_unlock_table(struct zram_table_entry *table)
3666 +       __clear_bit(ZRAM_ACCESS, &table->value);
3667 +       spin_unlock(&table->lock);
3670 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3672 +        size_t num_pages = disksize >> PAGE_SHIFT;
3673 +        size_t index;
3675 +        for (index = 0; index < num_pages; index++) {
3676 +               spinlock_t *lock = &meta->table[index].lock;
3677 +               spin_lock_init(lock);
3678 +        }
3680 +#endif /* CONFIG_PREEMPT_RT_BASE */
3682  #endif
3683 diff --git a/drivers/char/random.c b/drivers/char/random.c
3684 index 08d1dd58c0d2..25ee319dc8e3 100644
3685 --- a/drivers/char/random.c
3686 +++ b/drivers/char/random.c
3687 @@ -262,6 +262,7 @@
3688  #include <linux/syscalls.h>
3689  #include <linux/completion.h>
3690  #include <linux/uuid.h>
3691 +#include <linux/locallock.h>
3692  #include <crypto/chacha20.h>
3694  #include <asm/processor.h>
3695 @@ -1028,8 +1029,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3696         } sample;
3697         long delta, delta2, delta3;
3699 -       preempt_disable();
3701         sample.jiffies = jiffies;
3702         sample.cycles = random_get_entropy();
3703         sample.num = num;
3704 @@ -1070,7 +1069,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3705                  */
3706                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3707         }
3708 -       preempt_enable();
3711  void add_input_randomness(unsigned int type, unsigned int code,
3712 @@ -1123,28 +1121,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3713         return *(ptr + f->reg_idx++);
3716 -void add_interrupt_randomness(int irq, int irq_flags)
3717 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3719         struct entropy_store    *r;
3720         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3721 -       struct pt_regs          *regs = get_irq_regs();
3722         unsigned long           now = jiffies;
3723         cycles_t                cycles = random_get_entropy();
3724         __u32                   c_high, j_high;
3725 -       __u64                   ip;
3726         unsigned long           seed;
3727         int                     credit = 0;
3729         if (cycles == 0)
3730 -               cycles = get_reg(fast_pool, regs);
3731 +               cycles = get_reg(fast_pool, NULL);
3732         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3733         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3734         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3735         fast_pool->pool[1] ^= now ^ c_high;
3736 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3737 +       if (!ip)
3738 +               ip = _RET_IP_;
3739         fast_pool->pool[2] ^= ip;
3740         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3741 -               get_reg(fast_pool, regs);
3742 +               get_reg(fast_pool, NULL);
3744         fast_mix(fast_pool);
3745         add_interrupt_bench(cycles);
3746 @@ -2056,6 +2053,7 @@ struct batched_entropy {
3747   * goal of being quite fast and not depleting entropy.
3748   */
3749  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_long);
3750 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_long_lock);
3751  unsigned long get_random_long(void)
3753         unsigned long ret;
3754 @@ -2064,13 +2062,13 @@ unsigned long get_random_long(void)
3755         if (arch_get_random_long(&ret))
3756                 return ret;
3758 -       batch = &get_cpu_var(batched_entropy_long);
3759 +       batch = &get_locked_var(batched_entropy_long_lock, batched_entropy_long);
3760         if (batch->position % ARRAY_SIZE(batch->entropy_long) == 0) {
3761                 extract_crng((u8 *)batch->entropy_long);
3762                 batch->position = 0;
3763         }
3764         ret = batch->entropy_long[batch->position++];
3765 -       put_cpu_var(batched_entropy_long);
3766 +       put_locked_var(batched_entropy_long_lock, batched_entropy_long);
3767         return ret;
3769  EXPORT_SYMBOL(get_random_long);
3770 @@ -2082,6 +2080,8 @@ unsigned int get_random_int(void)
3772  #else
3773  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_int);
3774 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_int_lock);
3776  unsigned int get_random_int(void)
3778         unsigned int ret;
3779 @@ -2090,13 +2090,13 @@ unsigned int get_random_int(void)
3780         if (arch_get_random_int(&ret))
3781                 return ret;
3783 -       batch = &get_cpu_var(batched_entropy_int);
3784 +       batch = &get_locked_var(batched_entropy_int_lock, batched_entropy_int);
3785         if (batch->position % ARRAY_SIZE(batch->entropy_int) == 0) {
3786                 extract_crng((u8 *)batch->entropy_int);
3787                 batch->position = 0;
3788         }
3789         ret = batch->entropy_int[batch->position++];
3790 -       put_cpu_var(batched_entropy_int);
3791 +       put_locked_var(batched_entropy_int_lock, batched_entropy_int);
3792         return ret;
3794  #endif
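
The random.c hunks replace get_cpu_var()/put_cpu_var() with a local lock from <linux/locallock.h>, which this patch series introduces: on non-RT it degrades to the per-CPU section it replaces, on RT it is a per-CPU sleeping lock, so the batched-entropy update stays preemptible but still serialised per CPU. Sketch of the idiom with made-up batch state:

#include <linux/locallock.h>
#include <linux/percpu.h>

struct batch_state {
        unsigned int pos;
        unsigned char buf[64];
};

static DEFINE_PER_CPU(struct batch_state, batch_state);
static DEFINE_LOCAL_IRQ_LOCK(batch_lock);

static unsigned char next_byte(void)
{
        struct batch_state *b;
        unsigned char ret;

        b = &get_locked_var(batch_lock, batch_state);
        ret = b->buf[b->pos++ % sizeof(b->buf)];
        put_locked_var(batch_lock, batch_state);

        return ret;
}
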
3795 diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
3796 index 8022bea27fed..247330efd310 100644
3797 --- a/drivers/char/tpm/tpm_tis.c
3798 +++ b/drivers/char/tpm/tpm_tis.c
3799 @@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
3800         return container_of(data, struct tpm_tis_tcg_phy, priv);
3803 +#ifdef CONFIG_PREEMPT_RT_FULL
3805 + * Flushes previous write operations to the chip so that subsequent
3806 + * ioread*()s won't stall a CPU.
3807 + */
3808 +static inline void tpm_tis_flush(void __iomem *iobase)
3810 +       ioread8(iobase + TPM_ACCESS(0));
3812 +#else
3813 +#define tpm_tis_flush(iobase) do { } while (0)
3814 +#endif
3816 +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
3818 +       iowrite8(b, iobase + addr);
3819 +       tpm_tis_flush(iobase);
3822 +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
3824 +       iowrite32(b, iobase + addr);
3825 +       tpm_tis_flush(iobase);
3828  static bool interrupts = true;
3829  module_param(interrupts, bool, 0444);
3830  MODULE_PARM_DESC(interrupts, "Enable interrupts");
3831 @@ -103,7 +128,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
3832         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
3834         while (len--)
3835 -               iowrite8(*value++, phy->iobase + addr);
3836 +               tpm_tis_iowrite8(*value++, phy->iobase, addr);
3837         return 0;
3840 @@ -127,7 +152,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
3842         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
3844 -       iowrite32(value, phy->iobase + addr);
3845 +       tpm_tis_iowrite32(value, phy->iobase, addr);
3846         return 0;
3849 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3850 index 4da2af9694a2..5b6f57f500b8 100644
3851 --- a/drivers/clocksource/tcb_clksrc.c
3852 +++ b/drivers/clocksource/tcb_clksrc.c
3853 @@ -23,8 +23,7 @@
3854   *     this 32 bit free-running counter. the second channel is not used.
3855   *
3856   *   - The third channel may be used to provide a 16-bit clockevent
3857 - *     source, used in either periodic or oneshot mode.  This runs
3858 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3859 + *     source, used in either periodic or oneshot mode.
3860   *
3861   * A boot clocksource and clockevent source are also currently needed,
3862   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3863 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3864  struct tc_clkevt_device {
3865         struct clock_event_device       clkevt;
3866         struct clk                      *clk;
3867 +       bool                            clk_enabled;
3868 +       u32                             freq;
3869         void __iomem                    *regs;
3870  };
3872 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3873         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3876 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3877 - * because using one of the divided clocks would usually mean the
3878 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3879 - *
3880 - * A divided clock could be good for high resolution timers, since
3881 - * 30.5 usec resolution can seem "low".
3882 - */
3883  static u32 timer_clock;
3885 +static void tc_clk_disable(struct clock_event_device *d)
3887 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3889 +       clk_disable(tcd->clk);
3890 +       tcd->clk_enabled = false;
3893 +static void tc_clk_enable(struct clock_event_device *d)
3895 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3897 +       if (tcd->clk_enabled)
3898 +               return;
3899 +       clk_enable(tcd->clk);
3900 +       tcd->clk_enabled = true;
3903  static int tc_shutdown(struct clock_event_device *d)
3905         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3906 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3908         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3909         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3910 +       return 0;
3913 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3915 +       tc_shutdown(d);
3916         if (!clockevent_state_detached(d))
3917 -               clk_disable(tcd->clk);
3918 +               tc_clk_disable(d);
3920         return 0;
3922 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3923         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3924                 tc_shutdown(d);
3926 -       clk_enable(tcd->clk);
3927 +       tc_clk_enable(d);
3929 -       /* slow clock, count up to RC, then irq and stop */
3930 +       /* count up to RC, then irq and stop */
3931         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3932                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3933         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3934 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3935         /* By not making the gentime core emulate periodic mode on top
3936          * of oneshot, we get lower overhead and improved accuracy.
3937          */
3938 -       clk_enable(tcd->clk);
3939 +       tc_clk_enable(d);
3941 -       /* slow clock, count up to RC, then irq and restart */
3942 +       /* count up to RC, then irq and restart */
3943         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3944                      regs + ATMEL_TC_REG(2, CMR));
3945 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3946 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3948         /* Enable clock and interrupts on RC compare */
3949         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3950 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3951                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3952                                           CLOCK_EVT_FEAT_ONESHOT,
3953                 /* Should be lower than at91rm9200's system timer */
3954 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3955                 .rating                 = 125,
3956 +#else
3957 +               .rating                 = 200,
3958 +#endif
3959                 .set_next_event         = tc_next_event,
3960 -               .set_state_shutdown     = tc_shutdown,
3961 +               .set_state_shutdown     = tc_shutdown_clk_off,
3962                 .set_state_periodic     = tc_set_periodic,
3963                 .set_state_oneshot      = tc_set_oneshot,
3964         },
3965 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3966         return IRQ_NONE;
3969 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3970 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3972 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3973         int ret;
3974         struct clk *t2_clk = tc->clk[2];
3975         int irq = tc->irq[2];
3976 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3977         clkevt.regs = tc->regs;
3978         clkevt.clk = t2_clk;
3980 -       timer_clock = clk32k_divisor_idx;
3981 +       timer_clock = divisor_idx;
3982 +       if (!divisor)
3983 +               clkevt.freq = 32768;
3984 +       else
3985 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3987         clkevt.clkevt.cpumask = cpumask_of(0);
3989 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3990                 return ret;
3991         }
3993 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3994 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3996         return ret;
3998 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3999                 goto err_disable_t1;
4001         /* channel 2:  periodic and oneshot timer support */
4002 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4003         ret = setup_clkevents(tc, clk32k_divisor_idx);
4004 +#else
4005 +       ret = setup_clkevents(tc, best_divisor_idx);
4006 +#endif
4007         if (ret)
4008                 goto err_unregister_clksrc;
4010 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
4011 index 6555821bbdae..93288849b2bd 100644
4012 --- a/drivers/clocksource/timer-atmel-pit.c
4013 +++ b/drivers/clocksource/timer-atmel-pit.c
4014 @@ -46,6 +46,7 @@ struct pit_data {
4015         u32             cycle;
4016         u32             cnt;
4017         unsigned int    irq;
4018 +       bool            irq_requested;
4019         struct clk      *mck;
4020  };
4022 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
4024         /* disable irq, leaving the clocksource active */
4025         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
4026 +       if (data->irq_requested) {
4027 +               free_irq(data->irq, data);
4028 +               data->irq_requested = false;
4029 +       }
4030         return 0;
4033 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
4034  /*
4035   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
4036   */
4037  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
4039         struct pit_data *data = clkevt_to_pit_data(dev);
4040 +       int ret;
4042 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4043 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4044 +                         "at91_tick", data);
4045 +       if (ret)
4046 +               panic(pr_fmt("Unable to setup IRQ\n"));
4048 +       data->irq_requested = true;
4050         /* update clocksource counter */
4051         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
4052 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
4053                 return ret;
4054         }
4056 -       /* Set up irq handler */
4057 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4058 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4059 -                         "at91_tick", data);
4060 -       if (ret) {
4061 -               pr_err("Unable to setup IRQ\n");
4062 -               return ret;
4063 -       }
4065         /* Set up and register clockevents */
4066         data->clkevt.name = "pit";
4067         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
4068 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
4069 index e90ab5b63a90..9e124087c55f 100644
4070 --- a/drivers/clocksource/timer-atmel-st.c
4071 +++ b/drivers/clocksource/timer-atmel-st.c
4072 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
4073         last_crtr = read_CRTR();
4076 +static int atmel_st_irq;
4078  static int clkevt32k_shutdown(struct clock_event_device *evt)
4080         clkdev32k_disable_and_flush_irq();
4081         irqmask = 0;
4082         regmap_write(regmap_st, AT91_ST_IER, irqmask);
4083 +       free_irq(atmel_st_irq, regmap_st);
4084         return 0;
4087  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4089 +       int ret;
4091         clkdev32k_disable_and_flush_irq();
4093 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4094 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4095 +                         "at91_tick", regmap_st);
4096 +       if (ret)
4097 +               panic(pr_fmt("Unable to setup IRQ\n"));
4099         /*
4100          * ALM for oneshot irqs, set by next_event()
4101          * before 32 seconds have passed.
4102 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4104  static int clkevt32k_set_periodic(struct clock_event_device *dev)
4106 +       int ret;
4108         clkdev32k_disable_and_flush_irq();
4110 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4111 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4112 +                         "at91_tick", regmap_st);
4113 +       if (ret)
4114 +               panic(pr_fmt("Unable to setup IRQ\n"));
4116         /* PIT for periodic irqs; fixed rate of 1/HZ */
4117         irqmask = AT91_ST_PITS;
4118         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
4119 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
4121         struct clk *sclk;
4122         unsigned int sclk_rate, val;
4123 -       int irq, ret;
4124 +       int ret;
4126         regmap_st = syscon_node_to_regmap(node);
4127         if (IS_ERR(regmap_st)) {
4128 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
4129         regmap_read(regmap_st, AT91_ST_SR, &val);
4131         /* Get the interrupts property */
4132 -       irq  = irq_of_parse_and_map(node, 0);
4133 -       if (!irq) {
4134 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
4135 +       if (!atmel_st_irq) {
4136                 pr_err("Unable to get IRQ from DT\n");
4137                 return -EINVAL;
4138         }
4140 -       /* Make IRQs happen for the system timer */
4141 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4142 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4143 -                         "at91_tick", regmap_st);
4144 -       if (ret) {
4145 -               pr_err("Unable to setup IRQ\n");
4146 -               return ret;
4147 -       }
4149         sclk = of_clk_get(node, 0);
4150         if (IS_ERR(sclk)) {
4151                 pr_err("Unable to get slow clock\n");
4152 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4153 index a782ce87715c..19d265948526 100644
4154 --- a/drivers/connector/cn_proc.c
4155 +++ b/drivers/connector/cn_proc.c
4156 @@ -32,6 +32,7 @@
4157  #include <linux/pid_namespace.h>
4159  #include <linux/cn_proc.h>
4160 +#include <linux/locallock.h>
4162  /*
4163   * Size of a cn_msg followed by a proc_event structure.  Since the
4164 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4166  /* proc_event_counts is used as the sequence number of the netlink message */
4167  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4168 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4170  static inline void send_msg(struct cn_msg *msg)
4172 -       preempt_disable();
4173 +       local_lock(send_msg_lock);
4175         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4176         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4177 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4178          */
4179         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4181 -       preempt_enable();
4182 +       local_unlock(send_msg_lock);
4185  void proc_fork_connector(struct task_struct *task)
4186 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4187 index adbd1de1cea5..1fac5074f2cf 100644
4188 --- a/drivers/cpufreq/Kconfig.x86
4189 +++ b/drivers/cpufreq/Kconfig.x86
4190 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4192  config X86_POWERNOW_K8
4193         tristate "AMD Opteron/Athlon64 PowerNow!"
4194 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4195 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4196         help
4197           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4198           Support for K10 and newer processors is now in acpi-cpufreq.
4199 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4200 index 2117f172d7a2..96c15501b0c8 100644
4201 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4202 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4203 @@ -1489,7 +1489,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4204         if (ret)
4205                 return ret;
4207 +#ifndef CONFIG_PREEMPT_RT_BASE
4208         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4209 +#endif
4211         i915_gem_execbuffer_move_to_active(vmas, params->request);
4213 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4214 index 755d78832a66..97fb03dc4971 100644
4215 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4216 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4217 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4218         if (!mutex_is_locked(mutex))
4219                 return false;
4221 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4222 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4223         return mutex->owner == task;
4224  #else
4225         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4226 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4227 index 02908e37c228..05c0480576e1 100644
4228 --- a/drivers/gpu/drm/i915/i915_irq.c
4229 +++ b/drivers/gpu/drm/i915/i915_irq.c
4230 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4231         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4233         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4234 +       preempt_disable_rt();
4236         /* Get optional system timestamp before query. */
4237         if (stime)
4238 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4239                 *etime = ktime_get();
4241         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4242 +       preempt_enable_rt();
4244         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4246 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4247 index ce32303b3013..c0a53bf2e952 100644
4248 --- a/drivers/gpu/drm/i915/intel_display.c
4249 +++ b/drivers/gpu/drm/i915/intel_display.c
4250 @@ -12138,7 +12138,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4251         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4252         struct intel_flip_work *work;
4254 -       WARN_ON(!in_interrupt());
4255 +       WARN_ON_NONRT(!in_interrupt());
4257         if (crtc == NULL)
4258                 return;
4259 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4260 index 64f4e2e18594..aebf1e9eabcb 100644
4261 --- a/drivers/gpu/drm/i915/intel_sprite.c
4262 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4263 @@ -35,6 +35,7 @@
4264  #include <drm/drm_rect.h>
4265  #include <drm/drm_atomic.h>
4266  #include <drm/drm_plane_helper.h>
4267 +#include <linux/locallock.h>
4268  #include "intel_drv.h"
4269  #include "intel_frontbuffer.h"
4270  #include <drm/i915_drm.h>
4271 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4272                             1000 * adjusted_mode->crtc_htotal);
4275 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4277  /**
4278   * intel_pipe_update_start() - start update of a set of display registers
4279   * @crtc: the crtc of which the registers are going to be updated
4280 @@ -98,7 +101,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4281         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4282         max = vblank_start - 1;
4284 -       local_irq_disable();
4285 +       local_lock_irq(pipe_update_lock);
4287         if (min <= 0 || max <= 0)
4288                 return;
4289 @@ -128,11 +131,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4290                         break;
4291                 }
4293 -               local_irq_enable();
4294 +               local_unlock_irq(pipe_update_lock);
4296                 timeout = schedule_timeout(timeout);
4298 -               local_irq_disable();
4299 +               local_lock_irq(pipe_update_lock);
4300         }
4302         finish_wait(wq, &wait);
4303 @@ -202,7 +205,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4304                 crtc->base.state->event = NULL;
4305         }
4307 -       local_irq_enable();
4308 +       local_unlock_irq(pipe_update_lock);
4310         if (crtc->debug.start_vbl_count &&
4311             crtc->debug.start_vbl_count != end_vbl_count) {
4312 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4313 index 192b2d3a79cb..d5372a207326 100644
4314 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4315 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4316 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4317         if (!mutex_is_locked(mutex))
4318                 return false;
4320 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4321 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4322         return mutex->owner == task;
4323  #else
4324         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4325 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4326 index cdb8cb568c15..b6d7fd964cbc 100644
4327 --- a/drivers/gpu/drm/radeon/radeon_display.c
4328 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4329 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4330         struct radeon_device *rdev = dev->dev_private;
4332         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4333 +       preempt_disable_rt();
4335         /* Get optional system timestamp before query. */
4336         if (stime)
4337 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4338                 *etime = ktime_get();
4340         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4341 +       preempt_enable_rt();
4343         /* Decode into vertical and horizontal scanout position. */
4344         *vpos = position & 0x1fff;
4345 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4346 index 0276d2ef06ee..8868045eabde 100644
4347 --- a/drivers/hv/vmbus_drv.c
4348 +++ b/drivers/hv/vmbus_drv.c
4349 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4350         void *page_addr;
4351         struct hv_message *msg;
4352         union hv_synic_event_flags *event;
4353 +       struct pt_regs *regs = get_irq_regs();
4354 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4355         bool handled = false;
4357         page_addr = hv_context.synic_event_page[cpu];
4358 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4359                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4360         }
4362 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4363 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4367 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4368 index 36f76e28a0bf..394f142f90c7 100644
4369 --- a/drivers/ide/alim15x3.c
4370 +++ b/drivers/ide/alim15x3.c
4371 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4373         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4375 -       local_irq_save(flags);
4376 +       local_irq_save_nort(flags);
4378         if (m5229_revision < 0xC2) {
4379                 /*
4380 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4381         }
4382         pci_dev_put(north);
4383         pci_dev_put(isa_dev);
4384 -       local_irq_restore(flags);
4385 +       local_irq_restore_nort(flags);
4386         return 0;
4389 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4390 index 0ceae5cbd89a..c212e85d7f3e 100644
4391 --- a/drivers/ide/hpt366.c
4392 +++ b/drivers/ide/hpt366.c
4393 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4395         dma_old = inb(base + 2);
4397 -       local_irq_save(flags);
4398 +       local_irq_save_nort(flags);
4400         dma_new = dma_old;
4401         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4402 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4403         if (dma_new != dma_old)
4404                 outb(dma_new, base + 2);
4406 -       local_irq_restore(flags);
4407 +       local_irq_restore_nort(flags);
4409         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4410                          hwif->name, base, base + 7);
4411 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4412 index 19763977568c..4169433faab5 100644
4413 --- a/drivers/ide/ide-io-std.c
4414 +++ b/drivers/ide/ide-io-std.c
4415 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4416                 unsigned long uninitialized_var(flags);
4418                 if ((io_32bit & 2) && !mmio) {
4419 -                       local_irq_save(flags);
4420 +                       local_irq_save_nort(flags);
4421                         ata_vlb_sync(io_ports->nsect_addr);
4422                 }
4424 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4425                         insl(data_addr, buf, words);
4427                 if ((io_32bit & 2) && !mmio)
4428 -                       local_irq_restore(flags);
4429 +                       local_irq_restore_nort(flags);
4431                 if (((len + 1) & 3) < 2)
4432                         return;
4433 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4434                 unsigned long uninitialized_var(flags);
4436                 if ((io_32bit & 2) && !mmio) {
4437 -                       local_irq_save(flags);
4438 +                       local_irq_save_nort(flags);
4439                         ata_vlb_sync(io_ports->nsect_addr);
4440                 }
4442 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4443                         outsl(data_addr, buf, words);
4445                 if ((io_32bit & 2) && !mmio)
4446 -                       local_irq_restore(flags);
4447 +                       local_irq_restore_nort(flags);
4449                 if (((len + 1) & 3) < 2)
4450                         return;
4451 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4452 index 669ea1e45795..e12e43e62245 100644
4453 --- a/drivers/ide/ide-io.c
4454 +++ b/drivers/ide/ide-io.c
4455 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4456                 /* disable_irq_nosync ?? */
4457                 disable_irq(hwif->irq);
4458                 /* local CPU only, as if we were handling an interrupt */
4459 -               local_irq_disable();
4460 +               local_irq_disable_nort();
4461                 if (hwif->polling) {
4462                         startstop = handler(drive);
4463                 } else if (drive_is_ready(drive)) {
4464 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4465 index 376f2dc410c5..f014dd1b73dc 100644
4466 --- a/drivers/ide/ide-iops.c
4467 +++ b/drivers/ide/ide-iops.c
4468 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4469                                 if ((stat & ATA_BUSY) == 0)
4470                                         break;
4472 -                               local_irq_restore(flags);
4473 +                               local_irq_restore_nort(flags);
4474                                 *rstat = stat;
4475                                 return -EBUSY;
4476                         }
4477                 }
4478 -               local_irq_restore(flags);
4479 +               local_irq_restore_nort(flags);
4480         }
4481         /*
4482          * Allow status to settle, then read it again.
4483 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4484 index 0b63facd1d87..4ceba37afc0c 100644
4485 --- a/drivers/ide/ide-probe.c
4486 +++ b/drivers/ide/ide-probe.c
4487 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4488         int bswap = 1;
4490         /* local CPU only; some systems need this */
4491 -       local_irq_save(flags);
4492 +       local_irq_save_nort(flags);
4493         /* read 512 bytes of id info */
4494         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4495 -       local_irq_restore(flags);
4496 +       local_irq_restore_nort(flags);
4498         drive->dev_flags |= IDE_DFLAG_ID_READ;
4499  #ifdef DEBUG
4500 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4501 index a716693417a3..be0568c722d6 100644
4502 --- a/drivers/ide/ide-taskfile.c
4503 +++ b/drivers/ide/ide-taskfile.c
4504 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4506                 page_is_high = PageHighMem(page);
4507                 if (page_is_high)
4508 -                       local_irq_save(flags);
4509 +                       local_irq_save_nort(flags);
4511                 buf = kmap_atomic(page) + offset;
4513 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4514                 kunmap_atomic(buf);
4516                 if (page_is_high)
4517 -                       local_irq_restore(flags);
4518 +                       local_irq_restore_nort(flags);
4520                 len -= nr_bytes;
4521         }
4522 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4523         }
4525         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4526 -               local_irq_disable();
4527 +               local_irq_disable_nort();
4529         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4531 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4532 index fddff403d5d2..cca1bb4fbfe3 100644
4533 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4534 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4535 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4537         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4539 -       local_irq_save(flags);
4540 +       local_irq_save_nort(flags);
4541         netif_addr_lock(dev);
4542         spin_lock(&priv->lock);
4544 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4546         spin_unlock(&priv->lock);
4547         netif_addr_unlock(dev);
4548 -       local_irq_restore(flags);
4549 +       local_irq_restore_nort(flags);
4551         /*
4552          * make sure the in-flight joins have finished before we attempt
4553 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4554 index 4a2a9e370be7..e970d9afd179 100644
4555 --- a/drivers/input/gameport/gameport.c
4556 +++ b/drivers/input/gameport/gameport.c
4557 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4558         tx = ~0;
4560         for (i = 0; i < 50; i++) {
4561 -               local_irq_save(flags);
4562 +               local_irq_save_nort(flags);
4563                 t1 = ktime_get_ns();
4564                 for (t = 0; t < 50; t++)
4565                         gameport_read(gameport);
4566                 t2 = ktime_get_ns();
4567                 t3 = ktime_get_ns();
4568 -               local_irq_restore(flags);
4569 +               local_irq_restore_nort(flags);
4570                 udelay(i * 10);
4571                 t = (t2 - t1) - (t3 - t2);
4572                 if (t < tx)
4573 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4574         tx = 1 << 30;
4576         for(i = 0; i < 50; i++) {
4577 -               local_irq_save(flags);
4578 +               local_irq_save_nort(flags);
4579                 GET_TIME(t1);
4580                 for (t = 0; t < 50; t++) gameport_read(gameport);
4581                 GET_TIME(t2);
4582                 GET_TIME(t3);
4583 -               local_irq_restore(flags);
4584 +               local_irq_restore_nort(flags);
4585                 udelay(i * 10);
4586                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4587         }
4588 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4589         tx = 1 << 30;
4591         for(i = 0; i < 50; i++) {
4592 -               local_irq_save(flags);
4593 +               local_irq_save_nort(flags);
4594                 t1 = rdtsc();
4595                 for (t = 0; t < 50; t++) gameport_read(gameport);
4596                 t2 = rdtsc();
4597 -               local_irq_restore(flags);
4598 +               local_irq_restore_nort(flags);
4599                 udelay(i * 10);
4600                 if (t2 - t1 < tx) tx = t2 - t1;
4601         }
4602 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4603 index 0c910a863581..3408e5dd1b93 100644
4604 --- a/drivers/iommu/amd_iommu.c
4605 +++ b/drivers/iommu/amd_iommu.c
4606 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4607         int ret;
4609         /*
4610 -        * Must be called with IRQs disabled. Warn here to detect early
4611 -        * when its not.
4612 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4613 +        * detect early when it's not.
4614          */
4615 -       WARN_ON(!irqs_disabled());
4616 +       WARN_ON_NONRT(!irqs_disabled());
4618         /* lock domain */
4619         spin_lock(&domain->lock);
4620 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4621         struct protection_domain *domain;
4623         /*
4624 -        * Must be called with IRQs disabled. Warn here to detect early
4625 -        * when its not.
4626 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4627 +        * detect early when it's not.
4628          */
4629 -       WARN_ON(!irqs_disabled());
4630 +       WARN_ON_NONRT(!irqs_disabled());
4632         if (WARN_ON(!dev_data->domain))
4633                 return;
4634 @@ -2283,7 +2283,7 @@ static void queue_add(struct dma_ops_domain *dma_dom,
4635         pages     = __roundup_pow_of_two(pages);
4636         address >>= PAGE_SHIFT;
4638 -       queue = get_cpu_ptr(&flush_queue);
4639 +       queue = raw_cpu_ptr(&flush_queue);
4640         spin_lock_irqsave(&queue->lock, flags);
4642         if (queue->next == FLUSH_QUEUE_SIZE)
4643 @@ -2300,8 +2300,6 @@ static void queue_add(struct dma_ops_domain *dma_dom,
4645         if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
4646                 mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
4648 -       put_cpu_ptr(&flush_queue);
4652 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4653 index 88bbc8ccc5e3..8a1a8432a6bd 100644
4654 --- a/drivers/iommu/intel-iommu.c
4655 +++ b/drivers/iommu/intel-iommu.c
4656 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4657         struct deferred_flush_table *tables;
4658  };
4660 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4661 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4663  /* bitmap for indexing intel_iommus */
4664  static int g_num_of_iommus;
4665 @@ -3721,10 +3721,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4666         struct intel_iommu *iommu;
4667         struct deferred_flush_entry *entry;
4668         struct deferred_flush_data *flush_data;
4669 -       unsigned int cpuid;
4671 -       cpuid = get_cpu();
4672 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4673 +       flush_data = raw_cpu_ptr(&deferred_flush);
4675         /* Flush all CPUs' entries to avoid deferring too much.  If
4676          * this becomes a bottleneck, can just flush us, and rely on
4677 @@ -3757,8 +3755,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4678         }
4679         flush_data->size++;
4680         spin_unlock_irqrestore(&flush_data->lock, flags);
4682 -       put_cpu();
4685  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4686 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4687 index e23001bfcfee..359d5d169ec0 100644
4688 --- a/drivers/iommu/iova.c
4689 +++ b/drivers/iommu/iova.c
4690 @@ -22,6 +22,7 @@
4691  #include <linux/slab.h>
4692  #include <linux/smp.h>
4693  #include <linux/bitops.h>
4694 +#include <linux/cpu.h>
4696  static bool iova_rcache_insert(struct iova_domain *iovad,
4697                                unsigned long pfn,
4698 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4700                 /* Try replenishing IOVAs by flushing rcache. */
4701                 flushed_rcache = true;
4702 -               preempt_disable();
4703                 for_each_online_cpu(cpu)
4704                         free_cpu_cached_iovas(cpu, iovad);
4705 -               preempt_enable();
4706                 goto retry;
4707         }
4709 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4710         bool can_insert = false;
4711         unsigned long flags;
4713 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4714 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4715         spin_lock_irqsave(&cpu_rcache->lock, flags);
4717         if (!iova_magazine_full(cpu_rcache->loaded)) {
4718 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4719                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4721         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4722 -       put_cpu_ptr(rcache->cpu_rcaches);
4724         if (mag_to_free) {
4725                 iova_magazine_free_pfns(mag_to_free, iovad);
4726 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4727         bool has_pfn = false;
4728         unsigned long flags;
4730 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4731 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4732         spin_lock_irqsave(&cpu_rcache->lock, flags);
4734         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4735 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4736                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4738         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4739 -       put_cpu_ptr(rcache->cpu_rcaches);
4741         return iova_pfn;
4743 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4744 index 3f9ddb9fafa7..09da5b6b44a1 100644
4745 --- a/drivers/leds/trigger/Kconfig
4746 +++ b/drivers/leds/trigger/Kconfig
4747 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4749  config LEDS_TRIGGER_CPU
4750         bool "LED CPU Trigger"
4751 -       depends on LEDS_TRIGGERS
4752 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4753         help
4754           This allows LEDs to be controlled by active CPUs. This shows
4755           the active CPUs across an array of LEDs so you can see which
4756 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4757 index 4d200883c505..98b64ed5cb81 100644
4758 --- a/drivers/md/bcache/Kconfig
4759 +++ b/drivers/md/bcache/Kconfig
4760 @@ -1,6 +1,7 @@
4762  config BCACHE
4763         tristate "Block device as cache"
4764 +       depends on !PREEMPT_RT_FULL
4765         ---help---
4766         Allows a block device to be used as cache for other devices; uses
4767         a btree for indexing and the layout is optimized for SSDs.
4768 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4769 index ba7c4c685db3..834ec328f217 100644
4770 --- a/drivers/md/dm-rq.c
4771 +++ b/drivers/md/dm-rq.c
4772 @@ -842,7 +842,7 @@ static void dm_old_request_fn(struct request_queue *q)
4773                 /* Establish tio->ti before queuing work (map_tio_request) */
4774                 tio->ti = ti;
4775                 kthread_queue_work(&md->kworker, &tio->work);
4776 -               BUG_ON(!irqs_disabled());
4777 +               BUG_ON_NONRT(!irqs_disabled());
4778         }
4781 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4782 index 475a7a1bcfe0..8d2c9d70042e 100644
4783 --- a/drivers/md/raid5.c
4784 +++ b/drivers/md/raid5.c
4785 @@ -429,7 +429,7 @@ void raid5_release_stripe(struct stripe_head *sh)
4786                 md_wakeup_thread(conf->mddev->thread);
4787         return;
4788  slow_path:
4789 -       local_irq_save(flags);
4790 +       local_irq_save_nort(flags);
4791         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
4792         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
4793                 INIT_LIST_HEAD(&list);
4794 @@ -438,7 +438,7 @@ void raid5_release_stripe(struct stripe_head *sh)
4795                 spin_unlock(&conf->device_lock);
4796                 release_inactive_stripe_list(conf, &list, hash);
4797         }
4798 -       local_irq_restore(flags);
4799 +       local_irq_restore_nort(flags);
4802  static inline void remove_hash(struct stripe_head *sh)
4803 @@ -1937,8 +1937,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4804         struct raid5_percpu *percpu;
4805         unsigned long cpu;
4807 -       cpu = get_cpu();
4808 +       cpu = get_cpu_light();
4809         percpu = per_cpu_ptr(conf->percpu, cpu);
4810 +       spin_lock(&percpu->lock);
4811         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4812                 ops_run_biofill(sh);
4813                 overlap_clear++;
4814 @@ -1994,7 +1995,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4815                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4816                                 wake_up(&sh->raid_conf->wait_for_overlap);
4817                 }
4818 -       put_cpu();
4819 +       spin_unlock(&percpu->lock);
4820 +       put_cpu_light();
4823  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4824 @@ -6410,6 +6412,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4825                        __func__, cpu);
4826                 return -ENOMEM;
4827         }
4828 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4829         return 0;
4832 @@ -6420,7 +6423,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4833         conf->percpu = alloc_percpu(struct raid5_percpu);
4834         if (!conf->percpu)
4835                 return -ENOMEM;
4837         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4838         if (!err) {
4839                 conf->scribble_disks = max(conf->raid_disks,
4840 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4841 index 57ec49f0839e..0739604990b7 100644
4842 --- a/drivers/md/raid5.h
4843 +++ b/drivers/md/raid5.h
4844 @@ -504,6 +504,7 @@ struct r5conf {
4845         int                     recovery_disabled;
4846         /* per cpu variables */
4847         struct raid5_percpu {
4848 +               spinlock_t      lock;           /* Protection for -RT */
4849                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4850                 struct flex_array *scribble;   /* space for constructing buffer
4851                                               * lists and performing address
4852 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4853 index 64971baf11fa..215e91e36198 100644
4854 --- a/drivers/misc/Kconfig
4855 +++ b/drivers/misc/Kconfig
4856 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4857  config ATMEL_TCLIB
4858         bool "Atmel AT32/AT91 Timer/Counter Library"
4859         depends on (AVR32 || ARCH_AT91)
4860 +       default y if PREEMPT_RT_FULL
4861         help
4862           Select this if you want a library to allocate the Timer/Counter
4863           blocks found on many Atmel processors.  This facilitates using
4864 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4865           are combined to make a single 32-bit timer.
4867           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4868 -         may be used as a clock event device supporting oneshot mode
4869 -         (delays of up to two seconds) based on the 32 KiHz clock.
4870 +         may be used as a clock event device supporting oneshot mode.
4872  config ATMEL_TCB_CLKSRC_BLOCK
4873         int
4874 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4875           TC can be used for other purposes, such as PWM generation and
4876           interval timing.
4878 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4879 +       bool "TC Block use 32 KiHz clock"
4880 +       depends on ATMEL_TCB_CLKSRC
4881 +       default y if !PREEMPT_RT_FULL
4882 +       help
4883 +         Select this to use the 32 KiHz base clock rate as the TC block
4884 +         clock source for clock events.
4887  config DUMMY_IRQ
4888         tristate "Dummy IRQ handler"
4889         default n
4890 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4891 index df990bb8c873..1a162709a85e 100644
4892 --- a/drivers/mmc/host/mmci.c
4893 +++ b/drivers/mmc/host/mmci.c
4894 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4895         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4896         struct variant_data *variant = host->variant;
4897         void __iomem *base = host->base;
4898 -       unsigned long flags;
4899         u32 status;
4901         status = readl(base + MMCISTATUS);
4903         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4905 -       local_irq_save(flags);
4907         do {
4908                 unsigned int remain, len;
4909                 char *buffer;
4910 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4912         sg_miter_stop(sg_miter);
4914 -       local_irq_restore(flags);
4916         /*
4917          * If we have less than the fifo 'half-full' threshold to transfer,
4918          * trigger a PIO interrupt as soon as any data is available.
4919 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4920 index 9133e7926da5..63afb921ed40 100644
4921 --- a/drivers/net/ethernet/3com/3c59x.c
4922 +++ b/drivers/net/ethernet/3com/3c59x.c
4923 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4925         struct vortex_private *vp = netdev_priv(dev);
4926         unsigned long flags;
4927 -       local_irq_save(flags);
4928 +       local_irq_save_nort(flags);
4929         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4930 -       local_irq_restore(flags);
4931 +       local_irq_restore_nort(flags);
4933  #endif
4935 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4936                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4937                          */
4938                         unsigned long flags;
4939 -                       local_irq_save(flags);
4940 +                       local_irq_save_nort(flags);
4941                         if (vp->full_bus_master_tx)
4942                                 boomerang_interrupt(dev->irq, dev);
4943                         else
4944                                 vortex_interrupt(dev->irq, dev);
4945 -                       local_irq_restore(flags);
4946 +                       local_irq_restore_nort(flags);
4947                 }
4948         }
4950 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4951 index da4c2d8a4173..1420dfb56bac 100644
4952 --- a/drivers/net/ethernet/realtek/8139too.c
4953 +++ b/drivers/net/ethernet/realtek/8139too.c
4954 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4955         struct rtl8139_private *tp = netdev_priv(dev);
4956         const int irq = tp->pci_dev->irq;
4958 -       disable_irq(irq);
4959 +       disable_irq_nosync(irq);
4960         rtl8139_interrupt(irq, dev);
4961         enable_irq(irq);
4963 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4964 index bca6935a94db..d7a35ee34d03 100644
4965 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4966 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4967 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4968                         while (!ctx->done.done && msecs--)
4969                                 udelay(1000);
4970                 } else {
4971 -                       wait_event_interruptible(ctx->done.wait,
4972 +                       swait_event_interruptible(ctx->done.wait,
4973                                                  ctx->done.done);
4974                 }
4975                 break;
4976 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
4977 index bedce3453dd3..faf038978650 100644
4978 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
4979 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
4980 @@ -61,7 +61,7 @@ struct msm_pinctrl {
4981         struct notifier_block restart_nb;
4982         int irq;
4984 -       spinlock_t lock;
4985 +       raw_spinlock_t lock;
4987         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4988         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4989 @@ -153,14 +153,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
4990         if (WARN_ON(i == g->nfuncs))
4991                 return -EINVAL;
4993 -       spin_lock_irqsave(&pctrl->lock, flags);
4994 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4996         val = readl(pctrl->regs + g->ctl_reg);
4997         val &= ~mask;
4998         val |= i << g->mux_bit;
4999         writel(val, pctrl->regs + g->ctl_reg);
5001 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5002 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5004         return 0;
5006 @@ -323,14 +323,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
5007                         break;
5008                 case PIN_CONFIG_OUTPUT:
5009                         /* set output value */
5010 -                       spin_lock_irqsave(&pctrl->lock, flags);
5011 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
5012                         val = readl(pctrl->regs + g->io_reg);
5013                         if (arg)
5014                                 val |= BIT(g->out_bit);
5015                         else
5016                                 val &= ~BIT(g->out_bit);
5017                         writel(val, pctrl->regs + g->io_reg);
5018 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
5019 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5021                         /* enable output */
5022                         arg = 1;
5023 @@ -351,12 +351,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
5024                         return -EINVAL;
5025                 }
5027 -               spin_lock_irqsave(&pctrl->lock, flags);
5028 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
5029                 val = readl(pctrl->regs + g->ctl_reg);
5030                 val &= ~(mask << bit);
5031                 val |= arg << bit;
5032                 writel(val, pctrl->regs + g->ctl_reg);
5033 -               spin_unlock_irqrestore(&pctrl->lock, flags);
5034 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5035         }
5037         return 0;
5038 @@ -384,13 +384,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
5040         g = &pctrl->soc->groups[offset];
5042 -       spin_lock_irqsave(&pctrl->lock, flags);
5043 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5045         val = readl(pctrl->regs + g->ctl_reg);
5046         val &= ~BIT(g->oe_bit);
5047         writel(val, pctrl->regs + g->ctl_reg);
5049 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5050 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5052         return 0;
5054 @@ -404,7 +404,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
5056         g = &pctrl->soc->groups[offset];
5058 -       spin_lock_irqsave(&pctrl->lock, flags);
5059 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5061         val = readl(pctrl->regs + g->io_reg);
5062         if (value)
5063 @@ -417,7 +417,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
5064         val |= BIT(g->oe_bit);
5065         writel(val, pctrl->regs + g->ctl_reg);
5067 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5068 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5070         return 0;
5072 @@ -443,7 +443,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
5074         g = &pctrl->soc->groups[offset];
5076 -       spin_lock_irqsave(&pctrl->lock, flags);
5077 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5079         val = readl(pctrl->regs + g->io_reg);
5080         if (value)
5081 @@ -452,7 +452,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
5082                 val &= ~BIT(g->out_bit);
5083         writel(val, pctrl->regs + g->io_reg);
5085 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5086 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5089  #ifdef CONFIG_DEBUG_FS
5090 @@ -571,7 +571,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5092         g = &pctrl->soc->groups[d->hwirq];
5094 -       spin_lock_irqsave(&pctrl->lock, flags);
5095 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5097         val = readl(pctrl->regs + g->intr_cfg_reg);
5098         val &= ~BIT(g->intr_enable_bit);
5099 @@ -579,7 +579,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5101         clear_bit(d->hwirq, pctrl->enabled_irqs);
5103 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5104 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5107  static void msm_gpio_irq_unmask(struct irq_data *d)
5108 @@ -592,7 +592,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5110         g = &pctrl->soc->groups[d->hwirq];
5112 -       spin_lock_irqsave(&pctrl->lock, flags);
5113 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5115         val = readl(pctrl->regs + g->intr_cfg_reg);
5116         val |= BIT(g->intr_enable_bit);
5117 @@ -600,7 +600,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5119         set_bit(d->hwirq, pctrl->enabled_irqs);
5121 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5122 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5125  static void msm_gpio_irq_ack(struct irq_data *d)
5126 @@ -613,7 +613,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5128         g = &pctrl->soc->groups[d->hwirq];
5130 -       spin_lock_irqsave(&pctrl->lock, flags);
5131 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5133         val = readl(pctrl->regs + g->intr_status_reg);
5134         if (g->intr_ack_high)
5135 @@ -625,7 +625,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5136         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5137                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5139 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5140 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5143  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5144 @@ -638,7 +638,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5146         g = &pctrl->soc->groups[d->hwirq];
5148 -       spin_lock_irqsave(&pctrl->lock, flags);
5149 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5151         /*
5152          * For hw without possibility of detecting both edges
5153 @@ -712,7 +712,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5154         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5155                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5157 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5158 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5160         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
5161                 irq_set_handler_locked(d, handle_level_irq);
5162 @@ -728,11 +728,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
5163         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
5164         unsigned long flags;
5166 -       spin_lock_irqsave(&pctrl->lock, flags);
5167 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5169         irq_set_irq_wake(pctrl->irq, on);
5171 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5172 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5174         return 0;
5176 @@ -878,7 +878,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
5177         pctrl->soc = soc_data;
5178         pctrl->chip = msm_gpio_template;
5180 -       spin_lock_init(&pctrl->lock);
5181 +       raw_spin_lock_init(&pctrl->lock);
5183         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
5184         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
5185 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
5186 index 9bd41a35a78a..8e2d436c2e3f 100644
5187 --- a/drivers/scsi/fcoe/fcoe.c
5188 +++ b/drivers/scsi/fcoe/fcoe.c
5189 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
5190  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
5192         struct fcoe_percpu_s *fps;
5193 -       int rc;
5194 +       int rc, cpu = get_cpu_light();
5196 -       fps = &get_cpu_var(fcoe_percpu);
5197 +       fps = &per_cpu(fcoe_percpu, cpu);
5198         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
5199 -       put_cpu_var(fcoe_percpu);
5200 +       put_cpu_light();
5202         return rc;
5204 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
5205                 return 0;
5206         }
5208 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5209 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5210         stats->InvalidCRCCount++;
5211         if (stats->InvalidCRCCount < 5)
5212                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
5213 -       put_cpu();
5214 +       put_cpu_light();
5215         return -EINVAL;
5218 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5219          */
5220         hp = (struct fcoe_hdr *) skb_network_header(skb);
5222 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5223 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5224         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
5225                 if (stats->ErrorFrames < 5)
5226                         printk(KERN_WARNING "fcoe: FCoE version "
5227 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5228                 goto drop;
5230         if (!fcoe_filter_frames(lport, fp)) {
5231 -               put_cpu();
5232 +               put_cpu_light();
5233                 fc_exch_recv(lport, fp);
5234                 return;
5235         }
5236  drop:
5237         stats->ErrorFrames++;
5238 -       put_cpu();
5239 +       put_cpu_light();
5240         kfree_skb(skb);
5243 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
5244 index dcf36537a767..1a1f2e46452c 100644
5245 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
5246 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
5247 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5249         INIT_LIST_HEAD(&del_list);
5251 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
5252 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
5254         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
5255                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
5256 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5257                                 sel_time = fcf->time;
5258                 }
5259         }
5260 -       put_cpu();
5261 +       put_cpu_light();
5263         list_for_each_entry_safe(fcf, next, &del_list, list) {
5264                 /* Removes fcf from current list */
5265 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
5266 index 16ca31ad5ec0..c3987347e762 100644
5267 --- a/drivers/scsi/libfc/fc_exch.c
5268 +++ b/drivers/scsi/libfc/fc_exch.c
5269 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
5270         }
5271         memset(ep, 0, sizeof(*ep));
5273 -       cpu = get_cpu();
5274 +       cpu = get_cpu_light();
5275         pool = per_cpu_ptr(mp->pool, cpu);
5276         spin_lock_bh(&pool->lock);
5277 -       put_cpu();
5278 +       put_cpu_light();
5280         /* peek cache of free slot */
5281         if (pool->left != FC_XID_UNKNOWN) {
5282 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
5283 index 87f5e694dbed..23c0a50fb6aa 100644
5284 --- a/drivers/scsi/libsas/sas_ata.c
5285 +++ b/drivers/scsi/libsas/sas_ata.c
5286 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5287         /* TODO: audit callers to ensure they are ready for qc_issue to
5288          * unconditionally re-enable interrupts
5289          */
5290 -       local_irq_save(flags);
5291 +       local_irq_save_nort(flags);
5292         spin_unlock(ap->lock);
5294         /* If the device fell off, no sense in issuing commands */
5295 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5297   out:
5298         spin_lock(ap->lock);
5299 -       local_irq_restore(flags);
5300 +       local_irq_restore_nort(flags);
5301         return ret;
5304 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
5305 index edc48f3b8230..ee5c6f9dfb6f 100644
5306 --- a/drivers/scsi/qla2xxx/qla_inline.h
5307 +++ b/drivers/scsi/qla2xxx/qla_inline.h
5308 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
5310         unsigned long flags;
5311         struct qla_hw_data *ha = rsp->hw;
5312 -       local_irq_save(flags);
5313 +       local_irq_save_nort(flags);
5314         if (IS_P3P_TYPE(ha))
5315                 qla82xx_poll(0, rsp);
5316         else
5317                 ha->isp_ops->intr_handler(0, rsp);
5318 -       local_irq_restore(flags);
5319 +       local_irq_restore_nort(flags);
5322  static inline uint8_t *
5323 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
5324 index bddaabb288d4..8de0ec4222fe 100644
5325 --- a/drivers/scsi/qla2xxx/qla_isr.c
5326 +++ b/drivers/scsi/qla2xxx/qla_isr.c
5327 @@ -3129,7 +3129,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
5328                 * kref_put().
5329                 */
5330                 kref_get(&qentry->irq_notify.kref);
5331 +#ifdef CONFIG_PREEMPT_RT_BASE
5332 +               swork_queue(&qentry->irq_notify.swork);
5333 +#else
5334                 schedule_work(&qentry->irq_notify.work);
5335 +#endif
5336         }
5338         /*
5339 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
5340 index 95f4c1bcdb4c..0be934799bff 100644
5341 --- a/drivers/thermal/x86_pkg_temp_thermal.c
5342 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
5343 @@ -29,6 +29,7 @@
5344  #include <linux/pm.h>
5345  #include <linux/thermal.h>
5346  #include <linux/debugfs.h>
5347 +#include <linux/swork.h>
5348  #include <asm/cpu_device_id.h>
5349  #include <asm/mce.h>
5351 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
5352         }
5355 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5356 +static void platform_thermal_notify_work(struct swork_event *event)
5358         unsigned long flags;
5359         int cpu = smp_processor_id();
5360 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5361                         pkg_work_scheduled[phy_id]) {
5362                 disable_pkg_thres_interrupt();
5363                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5364 -               return -EINVAL;
5365 +               return;
5366         }
5367         pkg_work_scheduled[phy_id] = 1;
5368         spin_unlock_irqrestore(&pkg_work_lock, flags);
5369 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5370         schedule_delayed_work_on(cpu,
5371                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5372                                 msecs_to_jiffies(notify_delay_ms));
5375 +#ifdef CONFIG_PREEMPT_RT_FULL
5376 +static struct swork_event notify_work;
5378 +static int thermal_notify_work_init(void)
5380 +       int err;
5382 +       err = swork_get();
5383 +       if (err)
5384 +               return err;
5386 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5387         return 0;
5390 +static void thermal_notify_work_cleanup(void)
5392 +       swork_put();
5395 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5397 +       swork_queue(&notify_work);
5398 +       return 0;
5401 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5403 +static int thermal_notify_work_init(void) { return 0; }
5405 +static void thermal_notify_work_cleanup(void) {  }
5407 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5409 +       platform_thermal_notify_work(NULL);
5411 +       return 0;
5413 +#endif /* CONFIG_PREEMPT_RT_FULL */
5415  static int find_siblings_cpu(int cpu)
5417         int i;
5418 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5419         if (!x86_match_cpu(pkg_temp_thermal_ids))
5420                 return -ENODEV;
5422 +       if (thermal_notify_work_init())
5423 +               return -ENODEV;
5425         spin_lock_init(&pkg_work_lock);
5426         platform_thermal_package_notify =
5427                         pkg_temp_thermal_platform_thermal_notify;
5428 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5429         kfree(pkg_work_scheduled);
5430         platform_thermal_package_notify = NULL;
5431         platform_thermal_package_rate_control = NULL;
5433 +       thermal_notify_work_cleanup();
5434         return -ENODEV;
5437 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5438         mutex_unlock(&phy_dev_list_mutex);
5439         platform_thermal_package_notify = NULL;
5440         platform_thermal_package_rate_control = NULL;
5441 +       thermal_notify_work_cleanup();
5442         for_each_online_cpu(i)
5443                 cancel_delayed_work_sync(
5444                         &per_cpu(pkg_temp_thermal_threshold_work, i));
5445 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5446 index e8819aa20415..dd7f9bf45d6c 100644
5447 --- a/drivers/tty/serial/8250/8250_core.c
5448 +++ b/drivers/tty/serial/8250/8250_core.c
5449 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5451  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5453 -#define PASS_LIMIT     512
5455 + * On -rt we can have more delays, and legitimately
5456 + * so - so don't drop work spuriously and spam the
5457 + * syslog:
5458 + */
5459 +#ifdef CONFIG_PREEMPT_RT_FULL
5460 +# define PASS_LIMIT    1000000
5461 +#else
5462 +# define PASS_LIMIT    512
5463 +#endif
5465  #include <asm/serial.h>
5466  /*
5467 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5468 index f6e4373a8850..4620b51b0e7c 100644
5469 --- a/drivers/tty/serial/8250/8250_port.c
5470 +++ b/drivers/tty/serial/8250/8250_port.c
5471 @@ -35,6 +35,7 @@
5472  #include <linux/nmi.h>
5473  #include <linux/mutex.h>
5474  #include <linux/slab.h>
5475 +#include <linux/kdb.h>
5476  #include <linux/uaccess.h>
5477  #include <linux/pm_runtime.h>
5478  #include <linux/timer.h>
5479 @@ -3143,9 +3144,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5481         serial8250_rpm_get(up);
5483 -       if (port->sysrq)
5484 +       if (port->sysrq || oops_in_progress)
5485                 locked = 0;
5486 -       else if (oops_in_progress)
5487 +       else if (in_kdb_printk())
5488                 locked = spin_trylock_irqsave(&port->lock, flags);
5489         else
5490                 spin_lock_irqsave(&port->lock, flags);
5491 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5492 index e2c33b9528d8..53af53c43e8c 100644
5493 --- a/drivers/tty/serial/amba-pl011.c
5494 +++ b/drivers/tty/serial/amba-pl011.c
5495 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5497         clk_enable(uap->clk);
5499 -       local_irq_save(flags);
5500 +       /*
5501 +        * local_irq_save(flags);
5502 +        *
5503 +        * This local_irq_save() is nonsense. If we come in via sysrq
5504 +        * handling then interrupts are already disabled. Aside from
5505 +        * that, the port.sysrq check is racy on SMP regardless.
5506 +       */
5507         if (uap->port.sysrq)
5508                 locked = 0;
5509         else if (oops_in_progress)
5510 -               locked = spin_trylock(&uap->port.lock);
5511 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5512         else
5513 -               spin_lock(&uap->port.lock);
5514 +               spin_lock_irqsave(&uap->port.lock, flags);
5516         /*
5517          *      First save the CR then disable the interrupts
5518 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5519                 pl011_write(old_cr, uap, REG_CR);
5521         if (locked)
5522 -               spin_unlock(&uap->port.lock);
5523 -       local_irq_restore(flags);
5524 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5526         clk_disable(uap->clk);
5528 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5529 index 472ba3c813c1..e654cb421fb7 100644
5530 --- a/drivers/tty/serial/omap-serial.c
5531 +++ b/drivers/tty/serial/omap-serial.c
5532 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5534         pm_runtime_get_sync(up->dev);
5536 -       local_irq_save(flags);
5537 -       if (up->port.sysrq)
5538 -               locked = 0;
5539 -       else if (oops_in_progress)
5540 -               locked = spin_trylock(&up->port.lock);
5541 +       if (up->port.sysrq || oops_in_progress)
5542 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5543         else
5544 -               spin_lock(&up->port.lock);
5545 +               spin_lock_irqsave(&up->port.lock, flags);
5547         /*
5548          * First save the IER then disable the interrupts
5549 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5550         pm_runtime_mark_last_busy(up->dev);
5551         pm_runtime_put_autosuspend(up->dev);
5552         if (locked)
5553 -               spin_unlock(&up->port.lock);
5554 -       local_irq_restore(flags);
5555 +               spin_unlock_irqrestore(&up->port.lock, flags);
5558  static int __init
5559 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5560 index fcc7aa248ce7..fb2c38d875f9 100644
5561 --- a/drivers/usb/core/hcd.c
5562 +++ b/drivers/usb/core/hcd.c
5563 @@ -1764,9 +1764,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5564          * and no one may trigger the above deadlock situation when
5565          * running complete() in tasklet.
5566          */
5567 -       local_irq_save(flags);
5568 +       local_irq_save_nort(flags);
5569         urb->complete(urb);
5570 -       local_irq_restore(flags);
5571 +       local_irq_restore_nort(flags);
5573         usb_anchor_resume_wakeups(anchor);
5574         atomic_dec(&urb->use_count);
5575 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5576 index 7b107e43b1c4..f1e8534a1748 100644
5577 --- a/drivers/usb/gadget/function/f_fs.c
5578 +++ b/drivers/usb/gadget/function/f_fs.c
5579 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5580                 pr_info("%s(): freeing\n", __func__);
5581                 ffs_data_clear(ffs);
5582                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5583 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5584 +                      swait_active(&ffs->ep0req_completion.wait));
5585                 kfree(ffs->dev_name);
5586                 kfree(ffs);
5587         }
5588 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5589 index b8534d3f8bb0..8fcaf02e21b0 100644
5590 --- a/drivers/usb/gadget/legacy/inode.c
5591 +++ b/drivers/usb/gadget/legacy/inode.c
5592 @@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5593         spin_unlock_irq (&epdata->dev->lock);
5595         if (likely (value == 0)) {
5596 -               value = wait_event_interruptible (done.wait, done.done);
5597 +               value = swait_event_interruptible (done.wait, done.done);
5598                 if (value != 0) {
5599                         spin_lock_irq (&epdata->dev->lock);
5600                         if (likely (epdata->ep != NULL)) {
5601 @@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5602                                 usb_ep_dequeue (epdata->ep, epdata->req);
5603                                 spin_unlock_irq (&epdata->dev->lock);
5605 -                               wait_event (done.wait, done.done);
5606 +                               swait_event (done.wait, done.done);
5607                                 if (epdata->status == -ECONNRESET)
5608                                         epdata->status = -EINTR;
5609                         } else {
5610 diff --git a/fs/aio.c b/fs/aio.c
5611 index 0fcb49ad67d4..211ebc21e4db 100644
5612 --- a/fs/aio.c
5613 +++ b/fs/aio.c
5614 @@ -40,6 +40,7 @@
5615  #include <linux/ramfs.h>
5616  #include <linux/percpu-refcount.h>
5617  #include <linux/mount.h>
5618 +#include <linux/swork.h>
5620  #include <asm/kmap_types.h>
5621  #include <asm/uaccess.h>
5622 @@ -115,7 +116,7 @@ struct kioctx {
5623         struct page             **ring_pages;
5624         long                    nr_pages;
5626 -       struct work_struct      free_work;
5627 +       struct swork_event      free_work;
5629         /*
5630          * signals when all in-flight requests are done
5631 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5632                 .mount          = aio_mount,
5633                 .kill_sb        = kill_anon_super,
5634         };
5635 +       BUG_ON(swork_get());
5636         aio_mnt = kern_mount(&aio_fs);
5637         if (IS_ERR(aio_mnt))
5638                 panic("Failed to create aio fs mount.");
5639 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5640         return cancel(&kiocb->common);
5643 -static void free_ioctx(struct work_struct *work)
5644 +static void free_ioctx(struct swork_event *sev)
5646 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5647 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5649         pr_debug("freeing %p\n", ctx);
5651 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5652         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5653                 complete(&ctx->rq_wait->comp);
5655 -       INIT_WORK(&ctx->free_work, free_ioctx);
5656 -       schedule_work(&ctx->free_work);
5657 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5658 +       swork_queue(&ctx->free_work);
5661  /*
5662 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5663   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5664   * now it's safe to cancel any that need to be.
5665   */
5666 -static void free_ioctx_users(struct percpu_ref *ref)
5667 +static void free_ioctx_users_work(struct swork_event *sev)
5669 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5670 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5671         struct aio_kiocb *req;
5673         spin_lock_irq(&ctx->ctx_lock);
5674 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5675         percpu_ref_put(&ctx->reqs);
5678 +static void free_ioctx_users(struct percpu_ref *ref)
5680 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5682 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5683 +       swork_queue(&ctx->free_work);
5686  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5688         unsigned i, new_nr;
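The fs/aio.c hunks above move the freeing work out of atomic context: the percpu_ref
release callbacks can be invoked from atomic (RCU callback) context, where the sleeping
spinlocks of an RT kernel must not be taken, so the work_struct is replaced by the
"simple work" (swork) item that this patch series adds.  A minimal sketch of that
deferral pattern, using only the swork calls that appear in the hunk (the function and
variable names below are illustrative, not part of the patch):

        #include <linux/percpu-refcount.h>
        #include <linux/swork.h>

        static struct swork_event my_event;

        static void my_deferred_free(struct swork_event *sev)
        {
                /* worker thread context: sleeping locks are fine, also on RT */
        }

        static void my_atomic_release(struct percpu_ref *ref)
        {
                /* atomic (RCU callback) context: only queue, never free here */
                INIT_SWORK(&my_event, my_deferred_free);
                swork_queue(&my_event);
        }

A one-time swork_get(), as added to aio_setup() above, brings up the worker thread that
services the queued events.
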
5689 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5690 index a1fba4285277..3796769b4cd1 100644
5691 --- a/fs/autofs4/autofs_i.h
5692 +++ b/fs/autofs4/autofs_i.h
5693 @@ -31,6 +31,7 @@
5694  #include <linux/sched.h>
5695  #include <linux/mount.h>
5696  #include <linux/namei.h>
5697 +#include <linux/delay.h>
5698  #include <asm/current.h>
5699  #include <linux/uaccess.h>
5701 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5702 index d8e6d421c27f..2e689ab1306b 100644
5703 --- a/fs/autofs4/expire.c
5704 +++ b/fs/autofs4/expire.c
5705 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5706                         parent = p->d_parent;
5707                         if (!spin_trylock(&parent->d_lock)) {
5708                                 spin_unlock(&p->d_lock);
5709 -                               cpu_relax();
5710 +                               cpu_chill();
5711                                 goto relock;
5712                         }
5713                         spin_unlock(&p->d_lock);
5714 diff --git a/fs/buffer.c b/fs/buffer.c
5715 index 5d8f496d624e..48074bd91ea3 100644
5716 --- a/fs/buffer.c
5717 +++ b/fs/buffer.c
5718 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5719          * decide that the page is now completely done.
5720          */
5721         first = page_buffers(page);
5722 -       local_irq_save(flags);
5723 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5724 +       flags = bh_uptodate_lock_irqsave(first);
5725         clear_buffer_async_read(bh);
5726         unlock_buffer(bh);
5727         tmp = bh;
5728 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5729                 }
5730                 tmp = tmp->b_this_page;
5731         } while (tmp != bh);
5732 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5733 -       local_irq_restore(flags);
5734 +       bh_uptodate_unlock_irqrestore(first, flags);
5736         /*
5737          * If none of the buffers had errors and they are all
5738 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5739         return;
5741  still_busy:
5742 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5743 -       local_irq_restore(flags);
5744 -       return;
5745 +       bh_uptodate_unlock_irqrestore(first, flags);
5748  /*
5749 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5750         }
5752         first = page_buffers(page);
5753 -       local_irq_save(flags);
5754 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5755 +       flags = bh_uptodate_lock_irqsave(first);
5757         clear_buffer_async_write(bh);
5758         unlock_buffer(bh);
5759 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5760                 }
5761                 tmp = tmp->b_this_page;
5762         }
5763 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5764 -       local_irq_restore(flags);
5765 +       bh_uptodate_unlock_irqrestore(first, flags);
5766         end_page_writeback(page);
5767         return;
5769  still_busy:
5770 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5771 -       local_irq_restore(flags);
5772 -       return;
5773 +       bh_uptodate_unlock_irqrestore(first, flags);
5775  EXPORT_SYMBOL(end_buffer_async_write);
5777 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5778         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5779         if (ret) {
5780                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5781 +               buffer_head_init_locks(ret);
5782                 preempt_disable();
5783                 __this_cpu_inc(bh_accounting.nr);
5784                 recalc_bh_state();
5785 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5786 index a27fc8791551..791aecb7c1ac 100644
5787 --- a/fs/cifs/readdir.c
5788 +++ b/fs/cifs/readdir.c
5789 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5790         struct inode *inode;
5791         struct super_block *sb = parent->d_sb;
5792         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5793 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5794 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5796         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5798 diff --git a/fs/dcache.c b/fs/dcache.c
5799 index 67957f5b325c..f0719b2f1be5 100644
5800 --- a/fs/dcache.c
5801 +++ b/fs/dcache.c
5802 @@ -19,6 +19,7 @@
5803  #include <linux/mm.h>
5804  #include <linux/fs.h>
5805  #include <linux/fsnotify.h>
5806 +#include <linux/delay.h>
5807  #include <linux/slab.h>
5808  #include <linux/init.h>
5809  #include <linux/hash.h>
5810 @@ -777,6 +778,8 @@ static inline bool fast_dput(struct dentry *dentry)
5811   */
5812  void dput(struct dentry *dentry)
5814 +       struct dentry *parent;
5816         if (unlikely(!dentry))
5817                 return;
5819 @@ -815,9 +818,18 @@ void dput(struct dentry *dentry)
5820         return;
5822  kill_it:
5823 -       dentry = dentry_kill(dentry);
5824 -       if (dentry) {
5825 -               cond_resched();
5826 +       parent = dentry_kill(dentry);
5827 +       if (parent) {
5828 +               int r;
5830 +               if (parent == dentry) {
5831 +                       /* the task with the highest priority won't schedule */
5832 +                       r = cond_resched();
5833 +                       if (!r)
5834 +                               cpu_chill();
5835 +               } else {
5836 +                       dentry = parent;
5837 +               }
5838                 goto repeat;
5839         }
5841 @@ -2352,7 +2364,7 @@ void d_delete(struct dentry * dentry)
5842         if (dentry->d_lockref.count == 1) {
5843                 if (!spin_trylock(&inode->i_lock)) {
5844                         spin_unlock(&dentry->d_lock);
5845 -                       cpu_relax();
5846 +                       cpu_chill();
5847                         goto again;
5848                 }
5849                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5850 @@ -2397,9 +2409,10 @@ EXPORT_SYMBOL(d_rehash);
5851  static inline unsigned start_dir_add(struct inode *dir)
5854 +       preempt_disable_rt();
5855         for (;;) {
5856 -               unsigned n = dir->i_dir_seq;
5857 -               if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
5858 +               unsigned n = dir->__i_dir_seq;
5859 +               if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
5860                         return n;
5861                 cpu_relax();
5862         }
5863 @@ -2407,26 +2420,30 @@ static inline unsigned start_dir_add(struct inode *dir)
5865  static inline void end_dir_add(struct inode *dir, unsigned n)
5867 -       smp_store_release(&dir->i_dir_seq, n + 2);
5868 +       smp_store_release(&dir->__i_dir_seq, n + 2);
5869 +       preempt_enable_rt();
5872  static void d_wait_lookup(struct dentry *dentry)
5874 -       if (d_in_lookup(dentry)) {
5875 -               DECLARE_WAITQUEUE(wait, current);
5876 -               add_wait_queue(dentry->d_wait, &wait);
5877 -               do {
5878 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5879 -                       spin_unlock(&dentry->d_lock);
5880 -                       schedule();
5881 -                       spin_lock(&dentry->d_lock);
5882 -               } while (d_in_lookup(dentry));
5883 -       }
5884 +       struct swait_queue __wait;
5886 +       if (!d_in_lookup(dentry))
5887 +               return;
5889 +       INIT_LIST_HEAD(&__wait.task_list);
5890 +       do {
5891 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5892 +               spin_unlock(&dentry->d_lock);
5893 +               schedule();
5894 +               spin_lock(&dentry->d_lock);
5895 +       } while (d_in_lookup(dentry));
5896 +       finish_swait(dentry->d_wait, &__wait);
5899  struct dentry *d_alloc_parallel(struct dentry *parent,
5900                                 const struct qstr *name,
5901 -                               wait_queue_head_t *wq)
5902 +                               struct swait_queue_head *wq)
5904         unsigned int hash = name->hash;
5905         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5906 @@ -2440,7 +2457,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
5908  retry:
5909         rcu_read_lock();
5910 -       seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1;
5911 +       seq = smp_load_acquire(&parent->d_inode->__i_dir_seq) & ~1;
5912         r_seq = read_seqbegin(&rename_lock);
5913         dentry = __d_lookup_rcu(parent, name, &d_seq);
5914         if (unlikely(dentry)) {
5915 @@ -2462,7 +2479,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
5916                 goto retry;
5917         }
5918         hlist_bl_lock(b);
5919 -       if (unlikely(parent->d_inode->i_dir_seq != seq)) {
5920 +       if (unlikely(parent->d_inode->__i_dir_seq != seq)) {
5921                 hlist_bl_unlock(b);
5922                 rcu_read_unlock();
5923                 goto retry;
5924 @@ -2535,7 +2552,7 @@ void __d_lookup_done(struct dentry *dentry)
5925         hlist_bl_lock(b);
5926         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5927         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5928 -       wake_up_all(dentry->d_wait);
5929 +       swake_up_all(dentry->d_wait);
5930         dentry->d_wait = NULL;
5931         hlist_bl_unlock(b);
5932         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5933 @@ -3632,6 +3649,11 @@ EXPORT_SYMBOL(d_genocide);
5935  void __init vfs_caches_init_early(void)
5937 +       int i;
5939 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5940 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5942         dcache_init_early();
5943         inode_init_early();
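The fs/dcache.c hunks above switch the in-lookup waiters from a wait_queue_head_t to a
simple wait queue (swait) and open-code the wait loop in d_wait_lookup(), since
dentry->d_lock has to be dropped around schedule().  A stripped-down sketch of that
pattern, built only from the swait calls already used in the hunk (the queue, lock and
condition below are placeholders):

        #include <linux/sched.h>
        #include <linux/spinlock.h>
        #include <linux/swait.h>

        /* caller holds *lock; waits until busy() becomes false */
        static void wait_while_busy(struct swait_queue_head *wq,
                                    spinlock_t *lock, bool (*busy)(void))
        {
                struct swait_queue wait;

                if (!busy())
                        return;

                INIT_LIST_HEAD(&wait.task_list);
                do {
                        prepare_to_swait(wq, &wait, TASK_UNINTERRUPTIBLE);
                        spin_unlock(lock);
                        schedule();
                        spin_lock(lock);
                } while (busy());
                finish_swait(wq, &wait);
        }

The waker side uses swake_up_all(), as in __d_lookup_done() above.
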
5945 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5946 index 3cbc30413add..41a94f552aab 100644
5947 --- a/fs/eventpoll.c
5948 +++ b/fs/eventpoll.c
5949 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5950   */
5951  static void ep_poll_safewake(wait_queue_head_t *wq)
5953 -       int this_cpu = get_cpu();
5954 +       int this_cpu = get_cpu_light();
5956         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5957                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5959 -       put_cpu();
5960 +       put_cpu_light();
5963  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
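ep_poll_safewake() above trades get_cpu()/put_cpu() for get_cpu_light()/put_cpu_light().
Those helpers are introduced elsewhere in this patch series, not in the hunks shown here;
their assumed shape is roughly the following, i.e. they pin the task to its current CPU
without disabling preemption, so the nested wakeup stays preemptible on RT:

        /* assumed definitions, for illustration only */
        #define get_cpu_light()         ({ migrate_disable(); smp_processor_id(); })
        #define put_cpu_light()         migrate_enable()
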
5964 diff --git a/fs/exec.c b/fs/exec.c
5965 index b8c43be24751..71f4c6ec2bb8 100644
5966 --- a/fs/exec.c
5967 +++ b/fs/exec.c
5968 @@ -1038,12 +1038,14 @@ static int exec_mmap(struct mm_struct *mm)
5969                 }
5970         }
5971         task_lock(tsk);
5972 +       preempt_disable_rt();
5973         active_mm = tsk->active_mm;
5974         tsk->mm = mm;
5975         tsk->active_mm = mm;
5976         activate_mm(active_mm, mm);
5977         tsk->mm->vmacache_seqnum = 0;
5978         vmacache_flush(tsk);
5979 +       preempt_enable_rt();
5980         task_unlock(tsk);
5981         if (old_mm) {
5982                 up_read(&old_mm->mmap_sem);
5983 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
5984 index 0094923e5ebf..37fa06ef5417 100644
5985 --- a/fs/ext4/page-io.c
5986 +++ b/fs/ext4/page-io.c
5987 @@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio)
5988                  * We check all buffers in the page under BH_Uptodate_Lock
5989                  * to avoid races with other end io clearing async_write flags
5990                  */
5991 -               local_irq_save(flags);
5992 -               bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
5993 +               flags = bh_uptodate_lock_irqsave(head);
5994                 do {
5995                         if (bh_offset(bh) < bio_start ||
5996                             bh_offset(bh) + bh->b_size > bio_end) {
5997 @@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio)
5998                         if (bio->bi_error)
5999                                 buffer_io_error(bh);
6000                 } while ((bh = bh->b_this_page) != head);
6001 -               bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
6002 -               local_irq_restore(flags);
6003 +               bh_uptodate_unlock_irqrestore(head, flags);
6004                 if (!under_io) {
6005  #ifdef CONFIG_EXT4_FS_ENCRYPTION
6006                         if (data_page)
6007 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
6008 index 4bbad745415a..5f91ca248ab0 100644
6009 --- a/fs/fuse/dir.c
6010 +++ b/fs/fuse/dir.c
6011 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
6012         struct inode *dir = d_inode(parent);
6013         struct fuse_conn *fc;
6014         struct inode *inode;
6015 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6016 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6018         if (!o->nodeid) {
6019                 /*
6020 diff --git a/fs/inode.c b/fs/inode.c
6021 index 920aa0b1c6b0..3d6b5fd1bf06 100644
6022 --- a/fs/inode.c
6023 +++ b/fs/inode.c
6024 @@ -153,7 +153,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
6025         inode->i_bdev = NULL;
6026         inode->i_cdev = NULL;
6027         inode->i_link = NULL;
6028 -       inode->i_dir_seq = 0;
6029 +       inode->__i_dir_seq = 0;
6030         inode->i_rdev = 0;
6031         inode->dirtied_when = 0;
6033 diff --git a/fs/libfs.c b/fs/libfs.c
6034 index 9588780ad43e..9b37abd354c9 100644
6035 --- a/fs/libfs.c
6036 +++ b/fs/libfs.c
6037 @@ -89,7 +89,7 @@ static struct dentry *next_positive(struct dentry *parent,
6038                                     struct list_head *from,
6039                                     int count)
6041 -       unsigned *seq = &parent->d_inode->i_dir_seq, n;
6042 +       unsigned *seq = &parent->d_inode->__i_dir_seq, n;
6043         struct dentry *res;
6044         struct list_head *p;
6045         bool skipped;
6046 @@ -122,8 +122,9 @@ static struct dentry *next_positive(struct dentry *parent,
6047  static void move_cursor(struct dentry *cursor, struct list_head *after)
6049         struct dentry *parent = cursor->d_parent;
6050 -       unsigned n, *seq = &parent->d_inode->i_dir_seq;
6051 +       unsigned n, *seq = &parent->d_inode->__i_dir_seq;
6052         spin_lock(&parent->d_lock);
6053 +       preempt_disable_rt();
6054         for (;;) {
6055                 n = *seq;
6056                 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
6057 @@ -136,6 +137,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after)
6058         else
6059                 list_add_tail(&cursor->d_child, &parent->d_subdirs);
6060         smp_store_release(seq, n + 2);
6061 +       preempt_enable_rt();
6062         spin_unlock(&parent->d_lock);
6065 diff --git a/fs/locks.c b/fs/locks.c
6066 index 22c5b4aa4961..269c6a44449a 100644
6067 --- a/fs/locks.c
6068 +++ b/fs/locks.c
6069 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
6070                         return -ENOMEM;
6071         }
6073 -       percpu_down_read_preempt_disable(&file_rwsem);
6074 +       percpu_down_read(&file_rwsem);
6075         spin_lock(&ctx->flc_lock);
6076         if (request->fl_flags & FL_ACCESS)
6077                 goto find_conflict;
6078 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
6080  out:
6081         spin_unlock(&ctx->flc_lock);
6082 -       percpu_up_read_preempt_enable(&file_rwsem);
6083 +       percpu_up_read(&file_rwsem);
6084         if (new_fl)
6085                 locks_free_lock(new_fl);
6086         locks_dispose_list(&dispose);
6087 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
6088                 new_fl2 = locks_alloc_lock();
6089         }
6091 -       percpu_down_read_preempt_disable(&file_rwsem);
6092 +       percpu_down_read(&file_rwsem);
6093         spin_lock(&ctx->flc_lock);
6094         /*
6095          * New lock request. Walk all POSIX locks and look for conflicts. If
6096 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
6097         }
6098   out:
6099         spin_unlock(&ctx->flc_lock);
6100 -       percpu_up_read_preempt_enable(&file_rwsem);
6101 +       percpu_up_read(&file_rwsem);
6102         /*
6103          * Free any unused locks.
6104          */
6105 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6106                 return error;
6107         }
6109 -       percpu_down_read_preempt_disable(&file_rwsem);
6110 +       percpu_down_read(&file_rwsem);
6111         spin_lock(&ctx->flc_lock);
6113         time_out_leases(inode, &dispose);
6114 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6115         locks_insert_block(fl, new_fl);
6116         trace_break_lease_block(inode, new_fl);
6117         spin_unlock(&ctx->flc_lock);
6118 -       percpu_up_read_preempt_enable(&file_rwsem);
6119 +       percpu_up_read(&file_rwsem);
6121         locks_dispose_list(&dispose);
6122         error = wait_event_interruptible_timeout(new_fl->fl_wait,
6123                                                 !new_fl->fl_next, break_time);
6125 -       percpu_down_read_preempt_disable(&file_rwsem);
6126 +       percpu_down_read(&file_rwsem);
6127         spin_lock(&ctx->flc_lock);
6128         trace_break_lease_unblock(inode, new_fl);
6129         locks_delete_block(new_fl);
6130 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6131         }
6132  out:
6133         spin_unlock(&ctx->flc_lock);
6134 -       percpu_up_read_preempt_enable(&file_rwsem);
6135 +       percpu_up_read(&file_rwsem);
6136         locks_dispose_list(&dispose);
6137         locks_free_lock(new_fl);
6138         return error;
6139 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
6141         ctx = smp_load_acquire(&inode->i_flctx);
6142         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
6143 -               percpu_down_read_preempt_disable(&file_rwsem);
6144 +               percpu_down_read(&file_rwsem);
6145                 spin_lock(&ctx->flc_lock);
6146                 time_out_leases(inode, &dispose);
6147                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
6148 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
6149                         break;
6150                 }
6151                 spin_unlock(&ctx->flc_lock);
6152 -               percpu_up_read_preempt_enable(&file_rwsem);
6153 +               percpu_up_read(&file_rwsem);
6155                 locks_dispose_list(&dispose);
6156         }
6157 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
6158                 return -EINVAL;
6159         }
6161 -       percpu_down_read_preempt_disable(&file_rwsem);
6162 +       percpu_down_read(&file_rwsem);
6163         spin_lock(&ctx->flc_lock);
6164         time_out_leases(inode, &dispose);
6165         error = check_conflicting_open(dentry, arg, lease->fl_flags);
6166 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
6167                 lease->fl_lmops->lm_setup(lease, priv);
6168  out:
6169         spin_unlock(&ctx->flc_lock);
6170 -       percpu_up_read_preempt_enable(&file_rwsem);
6171 +       percpu_up_read(&file_rwsem);
6172         locks_dispose_list(&dispose);
6173         if (is_deleg)
6174                 inode_unlock(inode);
6175 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6176                 return error;
6177         }
6179 -       percpu_down_read_preempt_disable(&file_rwsem);
6180 +       percpu_down_read(&file_rwsem);
6181         spin_lock(&ctx->flc_lock);
6182         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
6183                 if (fl->fl_file == filp &&
6184 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6185         if (victim)
6186                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
6187         spin_unlock(&ctx->flc_lock);
6188 -       percpu_up_read_preempt_enable(&file_rwsem);
6189 +       percpu_up_read(&file_rwsem);
6190         locks_dispose_list(&dispose);
6191         return error;
6193 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
6194         if (list_empty(&ctx->flc_lease))
6195                 return;
6197 -       percpu_down_read_preempt_disable(&file_rwsem);
6198 +       percpu_down_read(&file_rwsem);
6199         spin_lock(&ctx->flc_lock);
6200         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
6201                 if (filp == fl->fl_file)
6202                         lease_modify(fl, F_UNLCK, &dispose);
6203         spin_unlock(&ctx->flc_lock);
6204 -       percpu_up_read_preempt_enable(&file_rwsem);
6205 +       percpu_up_read(&file_rwsem);
6207         locks_dispose_list(&dispose);
6209 diff --git a/fs/namei.c b/fs/namei.c
6210 index e7d125c23aa6..072a2f724437 100644
6211 --- a/fs/namei.c
6212 +++ b/fs/namei.c
6213 @@ -1626,7 +1626,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
6215         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
6216         struct inode *inode = dir->d_inode;
6217 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6218 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6220         inode_lock_shared(inode);
6221         /* Don't go there if it's already dead */
6222 @@ -3089,7 +3089,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
6223         struct dentry *dentry;
6224         int error, create_error = 0;
6225         umode_t mode = op->mode;
6226 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6227 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6229         if (unlikely(IS_DEADDIR(dir_inode)))
6230                 return -ENOENT;
6231 diff --git a/fs/namespace.c b/fs/namespace.c
6232 index d7360f9897b4..da188c6966a3 100644
6233 --- a/fs/namespace.c
6234 +++ b/fs/namespace.c
6235 @@ -14,6 +14,7 @@
6236  #include <linux/mnt_namespace.h>
6237  #include <linux/user_namespace.h>
6238  #include <linux/namei.h>
6239 +#include <linux/delay.h>
6240  #include <linux/security.h>
6241  #include <linux/idr.h>
6242  #include <linux/init.h>                /* init_rootfs */
6243 @@ -357,8 +358,11 @@ int __mnt_want_write(struct vfsmount *m)
6244          * incremented count after it has set MNT_WRITE_HOLD.
6245          */
6246         smp_mb();
6247 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
6248 -               cpu_relax();
6249 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
6250 +               preempt_enable();
6251 +               cpu_chill();
6252 +               preempt_disable();
6253 +       }
6254         /*
6255          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
6256          * be set to match its requirements. So we must not load that until
6257 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
6258 index dff600ae0d74..d726d2e09353 100644
6259 --- a/fs/nfs/delegation.c
6260 +++ b/fs/nfs/delegation.c
6261 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
6262                 sp = state->owner;
6263                 /* Block nfs4_proc_unlck */
6264                 mutex_lock(&sp->so_delegreturn_mutex);
6265 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6266 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
6267                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
6268                 if (!err)
6269                         err = nfs_delegation_claim_locks(ctx, state, stateid);
6270 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6271 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
6272                         err = -EAGAIN;
6273                 mutex_unlock(&sp->so_delegreturn_mutex);
6274                 put_nfs_open_context(ctx);
6275 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
6276 index 1e5321d1ed22..2510f2be8557 100644
6277 --- a/fs/nfs/dir.c
6278 +++ b/fs/nfs/dir.c
6279 @@ -485,7 +485,7 @@ static
6280  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
6282         struct qstr filename = QSTR_INIT(entry->name, entry->len);
6283 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6284 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6285         struct dentry *dentry;
6286         struct dentry *alias;
6287         struct inode *dir = d_inode(parent);
6288 @@ -1492,7 +1492,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
6289                     struct file *file, unsigned open_flags,
6290                     umode_t mode, int *opened)
6292 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6293 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6294         struct nfs_open_context *ctx;
6295         struct dentry *res;
6296         struct iattr attr = { .ia_valid = ATTR_OPEN };
6297 @@ -1807,7 +1807,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6299         trace_nfs_rmdir_enter(dir, dentry);
6300         if (d_really_is_positive(dentry)) {
6301 +#ifdef CONFIG_PREEMPT_RT_BASE
6302 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
6303 +#else
6304                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6305 +#endif
6306                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6307                 /* Ensure the VFS deletes this inode */
6308                 switch (error) {
6309 @@ -1817,7 +1821,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6310                 case -ENOENT:
6311                         nfs_dentry_handle_enoent(dentry);
6312                 }
6313 +#ifdef CONFIG_PREEMPT_RT_BASE
6314 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
6315 +#else
6316                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6317 +#endif
6318         } else
6319                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6320         trace_nfs_rmdir_exit(dir, dentry, error);
6321 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
6322 index 76ae25661d3f..89159d298278 100644
6323 --- a/fs/nfs/inode.c
6324 +++ b/fs/nfs/inode.c
6325 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
6326         nfsi->nrequests = 0;
6327         nfsi->commit_info.ncommit = 0;
6328         atomic_set(&nfsi->commit_info.rpcs_out, 0);
6329 +#ifdef CONFIG_PREEMPT_RT_BASE
6330 +       sema_init(&nfsi->rmdir_sem, 1);
6331 +#else
6332         init_rwsem(&nfsi->rmdir_sem);
6333 +#endif
6334         nfs4_init_once(nfsi);
6337 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
6338 index 1452177c822d..f43b01d54c59 100644
6339 --- a/fs/nfs/nfs4_fs.h
6340 +++ b/fs/nfs/nfs4_fs.h
6341 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
6342         unsigned long        so_flags;
6343         struct list_head     so_states;
6344         struct nfs_seqid_counter so_seqid;
6345 -       seqcount_t           so_reclaim_seqcount;
6346 +       seqlock_t            so_reclaim_seqlock;
6347         struct mutex         so_delegreturn_mutex;
6348  };
6350 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
6351 index 4638654e26f3..5dd6fd555c72 100644
6352 --- a/fs/nfs/nfs4proc.c
6353 +++ b/fs/nfs/nfs4proc.c
6354 @@ -2691,7 +2691,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6355         unsigned int seq;
6356         int ret;
6358 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6359 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6361         ret = _nfs4_proc_open(opendata);
6362         if (ret != 0)
6363 @@ -2729,7 +2729,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6365         if (d_inode(dentry) == state->inode) {
6366                 nfs_inode_attach_open_context(ctx);
6367 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6368 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
6369                         nfs4_schedule_stateid_recovery(server, state);
6370         }
6371  out:
6372 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
6373 index 71deeae6eefd..4be6999299dc 100644
6374 --- a/fs/nfs/nfs4state.c
6375 +++ b/fs/nfs/nfs4state.c
6376 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
6377         nfs4_init_seqid_counter(&sp->so_seqid);
6378         atomic_set(&sp->so_count, 1);
6379         INIT_LIST_HEAD(&sp->so_lru);
6380 -       seqcount_init(&sp->so_reclaim_seqcount);
6381 +       seqlock_init(&sp->so_reclaim_seqlock);
6382         mutex_init(&sp->so_delegreturn_mutex);
6383         return sp;
6385 @@ -1498,8 +1498,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6386          * recovering after a network partition or a reboot from a
6387          * server that doesn't support a grace period.
6388          */
6389 +#ifdef CONFIG_PREEMPT_RT_FULL
6390 +       write_seqlock(&sp->so_reclaim_seqlock);
6391 +#else
6392 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6393 +#endif
6394         spin_lock(&sp->so_lock);
6395 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
6396  restart:
6397         list_for_each_entry(state, &sp->so_states, open_states) {
6398                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
6399 @@ -1568,14 +1572,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6400                 spin_lock(&sp->so_lock);
6401                 goto restart;
6402         }
6403 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6404         spin_unlock(&sp->so_lock);
6405 +#ifdef CONFIG_PREEMPT_RT_FULL
6406 +       write_sequnlock(&sp->so_reclaim_seqlock);
6407 +#else
6408 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6409 +#endif
6410         return 0;
6411  out_err:
6412         nfs4_put_open_state(state);
6413 -       spin_lock(&sp->so_lock);
6414 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6415 -       spin_unlock(&sp->so_lock);
6416 +#ifdef CONFIG_PREEMPT_RT_FULL
6417 +       write_sequnlock(&sp->so_reclaim_seqlock);
6418 +#else
6419 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6420 +#endif
6421         return status;
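The NFSv4 hunks above (delegation.c, nfs4_fs.h, nfs4proc.c, nfs4state.c) replace the
so_reclaim_seqcount with a seqlock.  The apparent reason is that a bare seqcount relies
on the writer not being preempted, which the spinlock held around it guarantees on
mainline but not on RT, where spinlocks sleep; with a seqlock the reclaim writer may be
preempted while readers still detect that a reclaim ran concurrently (the #ifdef keeps
the plain seqcount behaviour on !RT).  A hypothetical reader against the field added in
the nfs4_fs.h hunk, for illustration (the in-tree readers above return -EAGAIN instead
of looping):

        /* assumes the nfs4_state_owner layout from the nfs4_fs.h hunk */
        static bool reclaim_was_quiescent(struct nfs4_state_owner *sp)
        {
                unsigned int seq;

                seq = read_seqbegin(&sp->so_reclaim_seqlock);
                /* ... sample state that nfs4_reclaim_open_state() may rewrite ... */
                return !read_seqretry(&sp->so_reclaim_seqlock, seq);
        }
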
6424 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
6425 index 191aa577dd1f..58990c8f52e0 100644
6426 --- a/fs/nfs/unlink.c
6427 +++ b/fs/nfs/unlink.c
6428 @@ -12,7 +12,7 @@
6429  #include <linux/sunrpc/clnt.h>
6430  #include <linux/nfs_fs.h>
6431  #include <linux/sched.h>
6432 -#include <linux/wait.h>
6433 +#include <linux/swait.h>
6434  #include <linux/namei.h>
6435  #include <linux/fsnotify.h>
6437 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
6438                 rpc_restart_call_prepare(task);
6441 +#ifdef CONFIG_PREEMPT_RT_BASE
6442 +static void nfs_down_anon(struct semaphore *sema)
6444 +       down(sema);
6447 +static void nfs_up_anon(struct semaphore *sema)
6449 +       up(sema);
6452 +#else
6453 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6455 +       down_read_non_owner(rwsem);
6458 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6460 +       up_read_non_owner(rwsem);
6462 +#endif
6464  /**
6465   * nfs_async_unlink_release - Release the sillydelete data.
6466   * @task: rpc_task of the sillydelete
6467 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6468         struct dentry *dentry = data->dentry;
6469         struct super_block *sb = dentry->d_sb;
6471 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6472 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6473         d_lookup_done(dentry);
6474         nfs_free_unlinkdata(data);
6475         dput(dentry);
6476 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6477         struct inode *dir = d_inode(dentry->d_parent);
6478         struct dentry *alias;
6480 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6481 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6482         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6483         if (IS_ERR(alias)) {
6484 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6485 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6486                 return 0;
6487         }
6488         if (!d_in_lookup(alias)) {
6489 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6490                         ret = 0;
6491                 spin_unlock(&alias->d_lock);
6492                 dput(alias);
6493 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6494 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6495                 /*
6496                  * If we'd displaced old cached devname, free it.  At that
6497                  * point dentry is definitely not a root, so we won't need
6498 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6499                 goto out_free_name;
6500         }
6501         data->res.dir_attr = &data->dir_attr;
6502 -       init_waitqueue_head(&data->wq);
6503 +       init_swait_queue_head(&data->wq);
6505         status = -EBUSY;
6506         spin_lock(&dentry->d_lock);
6507 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6508 index fe251f187ff8..e89da4fb14c2 100644
6509 --- a/fs/ntfs/aops.c
6510 +++ b/fs/ntfs/aops.c
6511 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6512                         ofs = 0;
6513                         if (file_ofs < init_size)
6514                                 ofs = init_size - file_ofs;
6515 -                       local_irq_save(flags);
6516 +                       local_irq_save_nort(flags);
6517                         kaddr = kmap_atomic(page);
6518                         memset(kaddr + bh_offset(bh) + ofs, 0,
6519                                         bh->b_size - ofs);
6520                         flush_dcache_page(page);
6521                         kunmap_atomic(kaddr);
6522 -                       local_irq_restore(flags);
6523 +                       local_irq_restore_nort(flags);
6524                 }
6525         } else {
6526                 clear_buffer_uptodate(bh);
6527 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6528                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6529         }
6530         first = page_buffers(page);
6531 -       local_irq_save(flags);
6532 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6533 +       flags = bh_uptodate_lock_irqsave(first);
6534         clear_buffer_async_read(bh);
6535         unlock_buffer(bh);
6536         tmp = bh;
6537 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6538                 }
6539                 tmp = tmp->b_this_page;
6540         } while (tmp != bh);
6541 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6542 -       local_irq_restore(flags);
6543 +       bh_uptodate_unlock_irqrestore(first, flags);
6544         /*
6545          * If none of the buffers had errors then we can set the page uptodate,
6546          * but we first have to perform the post read mst fixups, if the
6547 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6548                 recs = PAGE_SIZE / rec_size;
6549                 /* Should have been verified before we got here... */
6550                 BUG_ON(!recs);
6551 -               local_irq_save(flags);
6552 +               local_irq_save_nort(flags);
6553                 kaddr = kmap_atomic(page);
6554                 for (i = 0; i < recs; i++)
6555                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6556                                         i * rec_size), rec_size);
6557                 kunmap_atomic(kaddr);
6558 -               local_irq_restore(flags);
6559 +               local_irq_restore_nort(flags);
6560                 flush_dcache_page(page);
6561                 if (likely(page_uptodate && !PageError(page)))
6562                         SetPageUptodate(page);
6563 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6564         unlock_page(page);
6565         return;
6566  still_busy:
6567 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6568 -       local_irq_restore(flags);
6569 -       return;
6570 +       bh_uptodate_unlock_irqrestore(first, flags);
6573  /**
6574 diff --git a/fs/proc/base.c b/fs/proc/base.c
6575 index e67fec3c9856..0edc16f95596 100644
6576 --- a/fs/proc/base.c
6577 +++ b/fs/proc/base.c
6578 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6580         child = d_hash_and_lookup(dir, &qname);
6581         if (!child) {
6582 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6583 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6584                 child = d_alloc_parallel(dir, &qname, &wq);
6585                 if (IS_ERR(child))
6586                         goto end_instantiate;
6587 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6588 index d4e37acd4821..000cea46434a 100644
6589 --- a/fs/proc/proc_sysctl.c
6590 +++ b/fs/proc/proc_sysctl.c
6591 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6593         child = d_lookup(dir, &qname);
6594         if (!child) {
6595 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6596 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6597                 child = d_alloc_parallel(dir, &qname, &wq);
6598                 if (IS_ERR(child))
6599                         return false;
6600 diff --git a/fs/timerfd.c b/fs/timerfd.c
6601 index ab8dd1538381..5580853f57dd 100644
6602 --- a/fs/timerfd.c
6603 +++ b/fs/timerfd.c
6604 @@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
6605                                 break;
6606                 }
6607                 spin_unlock_irq(&ctx->wqh.lock);
6608 -               cpu_relax();
6609 +               if (isalarm(ctx))
6610 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6611 +               else
6612 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6613         }
6615         /*
6616 diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
6617 index d31cd1ebd8e9..5ea3f933a52a 100644
6618 --- a/fs/xfs/xfs_aops.c
6619 +++ b/fs/xfs/xfs_aops.c
6620 @@ -112,8 +112,7 @@ xfs_finish_page_writeback(
6621         ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
6622         ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
6624 -       local_irq_save(flags);
6625 -       bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
6626 +       flags = bh_uptodate_lock_irqsave(head);
6627         do {
6628                 if (off >= bvec->bv_offset &&
6629                     off < bvec->bv_offset + bvec->bv_len) {
6630 @@ -136,8 +135,7 @@ xfs_finish_page_writeback(
6631                 }
6632                 off += bh->b_size;
6633         } while ((bh = bh->b_this_page) != head);
6634 -       bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
6635 -       local_irq_restore(flags);
6636 +       bh_uptodate_unlock_irqrestore(head, flags);
6638         if (!busy)
6639                 end_page_writeback(bvec->bv_page);
6640 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6641 index e861a24f06f2..b5c97d3059c7 100644
6642 --- a/include/acpi/platform/aclinux.h
6643 +++ b/include/acpi/platform/aclinux.h
6644 @@ -133,6 +133,7 @@
6646  #define acpi_cache_t                        struct kmem_cache
6647  #define acpi_spinlock                       spinlock_t *
6648 +#define acpi_raw_spinlock              raw_spinlock_t *
6649  #define acpi_cpu_flags                      unsigned long
6651  /* Use native linux version of acpi_os_allocate_zeroed */
6652 @@ -151,6 +152,20 @@
6653  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6654  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6656 +#define acpi_os_create_raw_lock(__handle)                      \
6657 +({                                                             \
6658 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6659 +                                                               \
6660 +        if (lock) {                                            \
6661 +               *(__handle) = lock;                             \
6662 +               raw_spin_lock_init(*(__handle));                \
6663 +        }                                                      \
6664 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6665 + })
6667 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6670  /*
6671   * OSL interfaces used by debugger/disassembler
6672   */
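The aclinux.h hunk above adds a raw-spinlock flavour of the ACPI OSL lock: the regular
acpi_spinlock maps to spinlock_t, which becomes a sleeping lock on RT, so ACPI paths
that must stay atomic get acpi_raw_spinlock plus the create/delete helpers instead (the
call sites live elsewhere in this patch series).  A hypothetical user of the new hooks,
for illustration only:

        static acpi_raw_spinlock my_acpi_lock;

        static acpi_status my_acpi_init(void)
        {
                return acpi_os_create_raw_lock(&my_acpi_lock);
        }

        static void my_acpi_atomic_path(void)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(my_acpi_lock, flags);
                /* ... must not sleep, even on RT ... */
                raw_spin_unlock_irqrestore(my_acpi_lock, flags);
        }
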
6673 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6674 index 6f96247226a4..fa53a21263c2 100644
6675 --- a/include/asm-generic/bug.h
6676 +++ b/include/asm-generic/bug.h
6677 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6678  # define WARN_ON_SMP(x)                        ({0;})
6679  #endif
6681 +#ifdef CONFIG_PREEMPT_RT_BASE
6682 +# define BUG_ON_RT(c)                  BUG_ON(c)
6683 +# define BUG_ON_NONRT(c)               do { } while (0)
6684 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6685 +# define WARN_ON_NONRT(condition)      do { } while (0)
6686 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6687 +#else
6688 +# define BUG_ON_RT(c)                  do { } while (0)
6689 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6690 +# define WARN_ON_RT(condition)         do { } while (0)
6691 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6692 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6693 +#endif
6695  #endif /* __ASSEMBLY__ */
6697  #endif
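The asm-generic/bug.h hunk above adds RT/non-RT variants of the BUG_ON() and WARN_ON()
assertions, so checks that are only meaningful for one of the two preemption models can
be compiled out for the other.  A hypothetical caller, for illustration:

        /* mainline runs this section with interrupts hard-disabled; on RT the
         * same section may legitimately run with interrupts on, so the check
         * is restricted to !RT kernels */
        static void assert_mainline_irqs_off(void)
        {
                WARN_ON_NONRT(!irqs_disabled());
        }
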
6698 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6699 index 535ab2e13d2e..cfc246899473 100644
6700 --- a/include/linux/blk-mq.h
6701 +++ b/include/linux/blk-mq.h
6702 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6703         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6707 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6708  int blk_mq_request_started(struct request *rq);
6709  void blk_mq_start_request(struct request *rq);
6710  void blk_mq_end_request(struct request *rq, int error);
6711 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6712 index f6a816129856..ec7a4676f8a8 100644
6713 --- a/include/linux/blkdev.h
6714 +++ b/include/linux/blkdev.h
6715 @@ -89,6 +89,7 @@ struct request {
6716         struct list_head queuelist;
6717         union {
6718                 struct call_single_data csd;
6719 +               struct work_struct work;
6720                 u64 fifo_time;
6721         };
6723 @@ -467,7 +468,7 @@ struct request_queue {
6724         struct throtl_data *td;
6725  #endif
6726         struct rcu_head         rcu_head;
6727 -       wait_queue_head_t       mq_freeze_wq;
6728 +       struct swait_queue_head mq_freeze_wq;
6729         struct percpu_ref       q_usage_counter;
6730         struct list_head        all_q_node;
6732 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6733 index 8fdcb783197d..d07dbeec7bc1 100644
6734 --- a/include/linux/bottom_half.h
6735 +++ b/include/linux/bottom_half.h
6736 @@ -3,6 +3,39 @@
6738  #include <linux/preempt.h>
6740 +#ifdef CONFIG_PREEMPT_RT_FULL
6742 +extern void __local_bh_disable(void);
6743 +extern void _local_bh_enable(void);
6744 +extern void __local_bh_enable(void);
6746 +static inline void local_bh_disable(void)
6748 +       __local_bh_disable();
6751 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6753 +       __local_bh_disable();
6756 +static inline void local_bh_enable(void)
6758 +       __local_bh_enable();
6761 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6763 +       __local_bh_enable();
6766 +static inline void local_bh_enable_ip(unsigned long ip)
6768 +       __local_bh_enable();
6771 +#else
6773  #ifdef CONFIG_TRACE_IRQFLAGS
6774  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6775  #else
6776 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6778         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6780 +#endif
6782  #endif /* _LINUX_BH_H */
6783 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6784 index 4431ea2c8802..0744157a97ca 100644
6785 --- a/include/linux/buffer_head.h
6786 +++ b/include/linux/buffer_head.h
6787 @@ -75,8 +75,50 @@ struct buffer_head {
6788         struct address_space *b_assoc_map;      /* mapping this buffer is
6789                                                    associated with */
6790         atomic_t b_count;               /* users using this buffer_head */
6791 +#ifdef CONFIG_PREEMPT_RT_BASE
6792 +       spinlock_t b_uptodate_lock;
6793 +#if IS_ENABLED(CONFIG_JBD2)
6794 +       spinlock_t b_state_lock;
6795 +       spinlock_t b_journal_head_lock;
6796 +#endif
6797 +#endif
6798  };
6800 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6802 +       unsigned long flags;
6804 +#ifndef CONFIG_PREEMPT_RT_BASE
6805 +       local_irq_save(flags);
6806 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6807 +#else
6808 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6809 +#endif
6810 +       return flags;
6813 +static inline void
6814 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6816 +#ifndef CONFIG_PREEMPT_RT_BASE
6817 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6818 +       local_irq_restore(flags);
6819 +#else
6820 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6821 +#endif
6824 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6826 +#ifdef CONFIG_PREEMPT_RT_BASE
6827 +       spin_lock_init(&bh->b_uptodate_lock);
6828 +#if IS_ENABLED(CONFIG_JBD2)
6829 +       spin_lock_init(&bh->b_state_lock);
6830 +       spin_lock_init(&bh->b_journal_head_lock);
6831 +#endif
6832 +#endif
6835  /*
6836   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6837   * and buffer_foo() functions.
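The buffer_head.h hunk above is the counterpart of the fs/buffer.c, fs/ext4, fs/ntfs and
fs/xfs changes earlier in this patch: on !RT the new helpers keep the old
local_irq_save() + bit_spin_lock(BH_Uptodate_Lock) sequence, while on RT they use the
per-buffer-head spinlock added to the structure.  The call-site shape that the
filesystem hunks converge on looks roughly like this (a sketch, not an actual hunk):

        #include <linux/buffer_head.h>

        static void walk_page_buffers_locked(struct buffer_head *head)
        {
                unsigned long flags;
                struct buffer_head *bh = head;

                flags = bh_uptodate_lock_irqsave(head);
                do {
                        /* ... inspect or clear per-buffer async state ... */
                        bh = bh->b_this_page;
                } while (bh != head);
                bh_uptodate_unlock_irqrestore(head, flags);
        }
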
6838 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6839 index 6fb1c34cf805..ccd2a5addb56 100644
6840 --- a/include/linux/cgroup-defs.h
6841 +++ b/include/linux/cgroup-defs.h
6842 @@ -16,6 +16,7 @@
6843  #include <linux/percpu-refcount.h>
6844  #include <linux/percpu-rwsem.h>
6845  #include <linux/workqueue.h>
6846 +#include <linux/swork.h>
6848  #ifdef CONFIG_CGROUPS
6850 @@ -138,6 +139,7 @@ struct cgroup_subsys_state {
6851         /* percpu_ref killing and RCU release */
6852         struct rcu_head rcu_head;
6853         struct work_struct destroy_work;
6854 +       struct swork_event destroy_swork;
6855  };
6857  /*
6858 diff --git a/include/linux/completion.h b/include/linux/completion.h
6859 index 5d5aaae3af43..3bca1590e29f 100644
6860 --- a/include/linux/completion.h
6861 +++ b/include/linux/completion.h
6862 @@ -7,8 +7,7 @@
6863   * Atomic wait-for-completion handler data structures.
6864   * See kernel/sched/completion.c for details.
6865   */
6867 -#include <linux/wait.h>
6868 +#include <linux/swait.h>
6870  /*
6871   * struct completion - structure used to maintain state for a "completion"
6872 @@ -24,11 +23,11 @@
6873   */
6874  struct completion {
6875         unsigned int done;
6876 -       wait_queue_head_t wait;
6877 +       struct swait_queue_head wait;
6878  };
6880  #define COMPLETION_INITIALIZER(work) \
6881 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6882 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6884  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6885         ({ init_completion(&work); work; })
6886 @@ -73,7 +72,7 @@ struct completion {
6887  static inline void init_completion(struct completion *x)
6889         x->done = 0;
6890 -       init_waitqueue_head(&x->wait);
6891 +       init_swait_queue_head(&x->wait);
6894  /**
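The completion.h hunk above moves completions from the full wait queue to the simple
wait queue.  Users are unaffected; only the internal waiter list and its lock differ,
which keeps the wake-up path short and usable from the contexts completions are
completed in on RT.  Unchanged usage, as a sketch:

        #include <linux/completion.h>

        static DECLARE_COMPLETION(setup_done);

        static void waiter(void)
        {
                wait_for_completion(&setup_done);       /* sleeps on the swait queue now */
        }

        static void finisher(void)
        {
                complete(&setup_done);                  /* wakes exactly one waiter */
        }
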
6895 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6896 index e571128ad99a..5e52d28c20c1 100644
6897 --- a/include/linux/cpu.h
6898 +++ b/include/linux/cpu.h
6899 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6900  extern void put_online_cpus(void);
6901  extern void cpu_hotplug_disable(void);
6902  extern void cpu_hotplug_enable(void);
6903 +extern void pin_current_cpu(void);
6904 +extern void unpin_current_cpu(void);
6905  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6906  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6907  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6908 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6909  #define put_online_cpus()      do { } while (0)
6910  #define cpu_hotplug_disable()  do { } while (0)
6911  #define cpu_hotplug_enable()   do { } while (0)
6912 +static inline void pin_current_cpu(void) { }
6913 +static inline void unpin_current_cpu(void) { }
6914  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6915  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6916  /* These aren't inline functions due to a GCC bug. */
6917 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6918 index ff295e166b2c..d532c60f3fb5 100644
6919 --- a/include/linux/dcache.h
6920 +++ b/include/linux/dcache.h
6921 @@ -11,6 +11,7 @@
6922  #include <linux/rcupdate.h>
6923  #include <linux/lockref.h>
6924  #include <linux/stringhash.h>
6925 +#include <linux/wait.h>
6927  struct path;
6928  struct vfsmount;
6929 @@ -100,7 +101,7 @@ struct dentry {
6931         union {
6932                 struct list_head d_lru;         /* LRU list */
6933 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6934 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6935         };
6936         struct list_head d_child;       /* child of parent list */
6937         struct list_head d_subdirs;     /* our children */
6938 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6939  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6940  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6941  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6942 -                                       wait_queue_head_t *);
6943 +                                       struct swait_queue_head *);
6944  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6945  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6946  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6947 diff --git a/include/linux/delay.h b/include/linux/delay.h
6948 index a6ecb34cf547..37caab306336 100644
6949 --- a/include/linux/delay.h
6950 +++ b/include/linux/delay.h
6951 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6952         msleep(seconds * 1000);
6955 +#ifdef CONFIG_PREEMPT_RT_FULL
6956 +extern void cpu_chill(void);
6957 +#else
6958 +# define cpu_chill()   cpu_relax()
6959 +#endif
6961  #endif /* defined(_LINUX_DELAY_H) */
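cpu_chill(), declared in the delay.h hunk above, is what the earlier trylock-retry loops
(fs/autofs4/expire.c, fs/dcache.c, fs/namespace.c) now call instead of cpu_relax(): on
!RT it still compiles to cpu_relax(), on RT it sleeps briefly so the lock holder can run
even if the spinning task has a higher priority.  Because it may sleep, it cannot be
called with preemption disabled, which is why the __mnt_want_write() hunk wraps it in
preempt_enable()/preempt_disable().  The converted idiom, as a sketch (lock names are
placeholders):

        #include <linux/delay.h>
        #include <linux/spinlock.h>

        static void lock_pair(spinlock_t *outer, spinlock_t *inner)
        {
        again:
                spin_lock(outer);
                if (!spin_trylock(inner)) {
                        spin_unlock(outer);
                        cpu_chill();    /* short sleep on RT instead of busy-waiting */
                        goto again;
                }
                /* ... both locks held ... */
                spin_unlock(inner);
                spin_unlock(outer);
        }
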
6962 diff --git a/include/linux/fs.h b/include/linux/fs.h
6963 index d705ae084edd..ab1946f4a729 100644
6964 --- a/include/linux/fs.h
6965 +++ b/include/linux/fs.h
6966 @@ -688,7 +688,7 @@ struct inode {
6967                 struct block_device     *i_bdev;
6968                 struct cdev             *i_cdev;
6969                 char                    *i_link;
6970 -               unsigned                i_dir_seq;
6971 +               unsigned                __i_dir_seq;
6972         };
6974         __u32                   i_generation;
6975 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6976 index bb3f3297062a..a117a33ef72c 100644
6977 --- a/include/linux/highmem.h
6978 +++ b/include/linux/highmem.h
6979 @@ -7,6 +7,7 @@
6980  #include <linux/mm.h>
6981  #include <linux/uaccess.h>
6982  #include <linux/hardirq.h>
6983 +#include <linux/sched.h>
6985  #include <asm/cacheflush.h>
6987 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6989  static inline void *kmap_atomic(struct page *page)
6991 -       preempt_disable();
6992 +       preempt_disable_nort();
6993         pagefault_disable();
6994         return page_address(page);
6996 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6997  static inline void __kunmap_atomic(void *addr)
6999         pagefault_enable();
7000 -       preempt_enable();
7001 +       preempt_enable_nort();
7004  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
7005 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
7007  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
7009 +#ifndef CONFIG_PREEMPT_RT_FULL
7010  DECLARE_PER_CPU(int, __kmap_atomic_idx);
7011 +#endif
7013  static inline int kmap_atomic_idx_push(void)
7015 +#ifndef CONFIG_PREEMPT_RT_FULL
7016         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
7018 -#ifdef CONFIG_DEBUG_HIGHMEM
7019 +# ifdef CONFIG_DEBUG_HIGHMEM
7020         WARN_ON_ONCE(in_irq() && !irqs_disabled());
7021         BUG_ON(idx >= KM_TYPE_NR);
7022 -#endif
7023 +# endif
7024         return idx;
7025 +#else
7026 +       current->kmap_idx++;
7027 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
7028 +       return current->kmap_idx - 1;
7029 +#endif
7032  static inline int kmap_atomic_idx(void)
7034 +#ifndef CONFIG_PREEMPT_RT_FULL
7035         return __this_cpu_read(__kmap_atomic_idx) - 1;
7036 +#else
7037 +       return current->kmap_idx - 1;
7038 +#endif
7041  static inline void kmap_atomic_idx_pop(void)
7043 -#ifdef CONFIG_DEBUG_HIGHMEM
7044 +#ifndef CONFIG_PREEMPT_RT_FULL
7045 +# ifdef CONFIG_DEBUG_HIGHMEM
7046         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
7048         BUG_ON(idx < 0);
7049 -#else
7050 +# else
7051         __this_cpu_dec(__kmap_atomic_idx);
7052 +# endif
7053 +#else
7054 +       current->kmap_idx--;
7055 +# ifdef CONFIG_DEBUG_HIGHMEM
7056 +       BUG_ON(current->kmap_idx < 0);
7057 +# endif
7058  #endif
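The highmem.h hunk above stops kmap_atomic() from disabling preemption on RT
(preempt_disable_nort()), so the atomic-kmap stack index can no longer live in per-CPU
data and is kept in current->kmap_idx instead; both the _nort helpers and the
task_struct field are introduced elsewhere in this patch series.  Their assumed shape,
for illustration only:

        /* assumed definitions: no-ops on RT, the usual calls otherwise */
        #ifdef CONFIG_PREEMPT_RT_FULL
        # define preempt_disable_nort()         barrier()
        # define preempt_enable_nort()          barrier()
        #else
        # define preempt_disable_nort()         preempt_disable()
        # define preempt_enable_nort()          preempt_enable()
        #endif
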
7061 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
7062 index 5e00f80b1535..a34e10b55cde 100644
7063 --- a/include/linux/hrtimer.h
7064 +++ b/include/linux/hrtimer.h
7065 @@ -87,6 +87,9 @@ enum hrtimer_restart {
7066   * @function:  timer expiry callback function
7067   * @base:      pointer to the timer base (per cpu and per clock)
7068   * @state:     state information (See bit values above)
7069 + * @cb_entry:  list entry to defer timers from hardirq context
7070 + * @irqsafe:   timer can run in hardirq context
7071 + * @praecox:   timer expiry time if expired at the time of programming
7072   * @is_rel:    Set if the timer was armed relative
7073   * @start_pid:  timer statistics field to store the pid of the task which
7074   *             started the timer
7075 @@ -103,6 +106,11 @@ struct hrtimer {
7076         enum hrtimer_restart            (*function)(struct hrtimer *);
7077         struct hrtimer_clock_base       *base;
7078         u8                              state;
7079 +       struct list_head                cb_entry;
7080 +       int                             irqsafe;
7081 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
7082 +       ktime_t                         praecox;
7083 +#endif
7084         u8                              is_rel;
7085  #ifdef CONFIG_TIMER_STATS
7086         int                             start_pid;
7087 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
7088         struct task_struct *task;
7089  };
7091 -#ifdef CONFIG_64BIT
7092  # define HRTIMER_CLOCK_BASE_ALIGN      64
7093 -#else
7094 -# define HRTIMER_CLOCK_BASE_ALIGN      32
7095 -#endif
7097  /**
7098   * struct hrtimer_clock_base - the timer base for a specific clock
7099 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
7100   *                     timer to a base on another cpu.
7101   * @clockid:           clock id for per_cpu support
7102   * @active:            red black tree root node for the active timers
7103 + * @expired:           list head for deferred timers.
7104   * @get_time:          function to retrieve the current time of the clock
7105   * @offset:            offset of this clock to the monotonic base
7106   */
7107 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
7108         int                     index;
7109         clockid_t               clockid;
7110         struct timerqueue_head  active;
7111 +       struct list_head        expired;
7112         ktime_t                 (*get_time)(void);
7113         ktime_t                 offset;
7114  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
7115 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
7116         raw_spinlock_t                  lock;
7117         seqcount_t                      seq;
7118         struct hrtimer                  *running;
7119 +       struct hrtimer                  *running_soft;
7120         unsigned int                    cpu;
7121         unsigned int                    active_bases;
7122         unsigned int                    clock_was_set_seq;
7123 @@ -202,6 +209,9 @@ struct hrtimer_cpu_base {
7124         unsigned int                    nr_retries;
7125         unsigned int                    nr_hangs;
7126         unsigned int                    max_hang_time;
7127 +#endif
7128 +#ifdef CONFIG_PREEMPT_RT_BASE
7129 +       wait_queue_head_t               wait;
7130  #endif
7131         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
7132  } ____cacheline_aligned;
7133 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
7134         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
7137 +/* Softirq preemption could deadlock timer removal */
7138 +#ifdef CONFIG_PREEMPT_RT_BASE
7139 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
7140 +#else
7141 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
7142 +#endif
7144  /* Query timers: */
7145  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
7147 @@ -436,9 +453,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
7148   * Helper function to check, whether the timer is running the callback
7149   * function
7150   */
7151 -static inline int hrtimer_callback_running(struct hrtimer *timer)
7152 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
7154 -       return timer->base->cpu_base->running == timer;
7155 +       if (timer->base->cpu_base->running == timer)
7156 +               return 1;
7157 +#ifdef CONFIG_PREEMPT_RT_BASE
7158 +       if (timer->base->cpu_base->running_soft == timer)
7159 +               return 1;
7160 +#endif
7161 +       return 0;
7164  /* Forward a hrtimer so it expires after now: */
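
hrtimer_wait_for_timer(), declared above for CONFIG_PREEMPT_RT_BASE, lets synchronous timer removal sleep until a callback running in the now-preemptible softirq context has finished instead of spinning on it. A simplified sketch of the cancel-and-wait pattern it supports, shown only for illustration:

#include <linux/hrtimer.h>

/* Illustrative synchronous cancel: retry until the callback is no
 * longer running, sleeping via hrtimer_wait_for_timer() on RT. */
static int example_cancel_sync(struct hrtimer *timer)
{
        for (;;) {
                int ret = hrtimer_try_to_cancel(timer);

                if (ret >= 0)
                        return ret;
                hrtimer_wait_for_timer(timer);
        }
}
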
7165 diff --git a/include/linux/idr.h b/include/linux/idr.h
7166 index 083d61e92706..5899796f50cb 100644
7167 --- a/include/linux/idr.h
7168 +++ b/include/linux/idr.h
7169 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
7170   * Each idr_preload() should be matched with an invocation of this
7171   * function.  See idr_preload() for details.
7172   */
7173 +#ifdef CONFIG_PREEMPT_RT_FULL
7174 +void idr_preload_end(void);
7175 +#else
7176  static inline void idr_preload_end(void)
7178         preempt_enable();
7180 +#endif
7182  /**
7183   * idr_find - return pointer for given id
7184 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
7185 index 325f649d77ff..a56e263f5005 100644
7186 --- a/include/linux/init_task.h
7187 +++ b/include/linux/init_task.h
7188 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
7189  # define INIT_PERF_EVENTS(tsk)
7190  #endif
7192 +#ifdef CONFIG_PREEMPT_RT_BASE
7193 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
7194 +#else
7195 +# define INIT_TIMER_LIST
7196 +#endif
7198  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
7199  # define INIT_VTIME(tsk)                                               \
7200         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
7201 @@ -164,6 +170,7 @@ extern struct task_group root_task_group;
7202  #ifdef CONFIG_RT_MUTEXES
7203  # define INIT_RT_MUTEXES(tsk)                                          \
7204         .pi_waiters = RB_ROOT,                                          \
7205 +       .pi_top_task = NULL,                                            \
7206         .pi_waiters_leftmost = NULL,
7207  #else
7208  # define INIT_RT_MUTEXES(tsk)
7209 @@ -250,6 +257,7 @@ extern struct task_group root_task_group;
7210         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
7211         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
7212         .timer_slack_ns = 50000, /* 50 usec default slack */            \
7213 +       INIT_TIMER_LIST                                                 \
7214         .pids = {                                                       \
7215                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
7216                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
7217 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
7218 index 72f0721f75e7..480972ae47d3 100644
7219 --- a/include/linux/interrupt.h
7220 +++ b/include/linux/interrupt.h
7221 @@ -14,6 +14,7 @@
7222  #include <linux/hrtimer.h>
7223  #include <linux/kref.h>
7224  #include <linux/workqueue.h>
7225 +#include <linux/swork.h>
7227  #include <linux/atomic.h>
7228  #include <asm/ptrace.h>
7229 @@ -61,6 +62,7 @@
7230   *                interrupt handler after suspending interrupts. For system
7231   *                wakeup devices users need to implement wakeup detection in
7232   *                their interrupt handlers.
7233 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
7234   */
7235  #define IRQF_SHARED            0x00000080
7236  #define IRQF_PROBE_SHARED      0x00000100
7237 @@ -74,6 +76,7 @@
7238  #define IRQF_NO_THREAD         0x00010000
7239  #define IRQF_EARLY_RESUME      0x00020000
7240  #define IRQF_COND_SUSPEND      0x00040000
7241 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
7243  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
7245 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
7246  #ifdef CONFIG_LOCKDEP
7247  # define local_irq_enable_in_hardirq() do { } while (0)
7248  #else
7249 -# define local_irq_enable_in_hardirq() local_irq_enable()
7250 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
7251  #endif
7253  extern void disable_irq_nosync(unsigned int irq);
7254 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
7255   * struct irq_affinity_notify - context for notification of IRQ affinity changes
7256   * @irq:               Interrupt to which notification applies
7257   * @kref:              Reference count, for internal use
7258 + * @swork:             Swork item, for internal use
7259   * @work:              Work item, for internal use
7260   * @notify:            Function to be called on change.  This will be
7261   *                     called in process context.
7262 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
7263  struct irq_affinity_notify {
7264         unsigned int irq;
7265         struct kref kref;
7266 +#ifdef CONFIG_PREEMPT_RT_BASE
7267 +       struct swork_event swork;
7268 +#else
7269         struct work_struct work;
7270 +#endif
7271         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
7272         void (*release)(struct kref *ref);
7273  };
7274 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
7275                                  bool state);
7277  #ifdef CONFIG_IRQ_FORCED_THREADING
7278 +# ifndef CONFIG_PREEMPT_RT_BASE
7279  extern bool force_irqthreads;
7280 +# else
7281 +#  define force_irqthreads     (true)
7282 +# endif
7283  #else
7284 -#define force_irqthreads       (0)
7285 +#define force_irqthreads       (false)
7286  #endif
7288  #ifndef __ARCH_SET_SOFTIRQ_PENDING
7289 @@ -465,9 +477,10 @@ struct softirq_action
7290         void    (*action)(struct softirq_action *);
7291  };
7293 +#ifndef CONFIG_PREEMPT_RT_FULL
7294  asmlinkage void do_softirq(void);
7295  asmlinkage void __do_softirq(void);
7297 +static inline void thread_do_softirq(void) { do_softirq(); }
7298  #ifdef __ARCH_HAS_DO_SOFTIRQ
7299  void do_softirq_own_stack(void);
7300  #else
7301 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
7302         __do_softirq();
7304  #endif
7305 +#else
7306 +extern void thread_do_softirq(void);
7307 +#endif
7309  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
7310  extern void softirq_init(void);
7311  extern void __raise_softirq_irqoff(unsigned int nr);
7312 +#ifdef CONFIG_PREEMPT_RT_FULL
7313 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
7314 +#else
7315 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
7317 +       __raise_softirq_irqoff(nr);
7319 +#endif
7321  extern void raise_softirq_irqoff(unsigned int nr);
7322  extern void raise_softirq(unsigned int nr);
7323 +extern void softirq_check_pending_idle(void);
7325  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
7327 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
7328       to be executed on some cpu at least once after this.
7329     * If the tasklet is already scheduled, but its execution is still not
7330       started, it will be executed only once.
7331 -   * If this tasklet is already running on another CPU (or schedule is called
7332 -     from tasklet itself), it is rescheduled for later.
7333 +   * If this tasklet is already running on another CPU, it is rescheduled
7334 +     for later.
7335 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
7336     * Tasklet is strictly serialized wrt itself, but not
7337       wrt another tasklets. If client needs some intertask synchronization,
7338       he makes it with spinlocks.
7339 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
7340  enum
7342         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
7343 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
7344 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
7345 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
7346  };
7348 -#ifdef CONFIG_SMP
7349 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
7350 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
7351 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
7353 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
7354  static inline int tasklet_trylock(struct tasklet_struct *t)
7356         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
7359 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
7361 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
7364  static inline void tasklet_unlock(struct tasklet_struct *t)
7366         smp_mb__before_atomic();
7367         clear_bit(TASKLET_STATE_RUN, &(t)->state);
7370 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
7372 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
7374 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
7376  #else
7377  #define tasklet_trylock(t) 1
7378 +#define tasklet_tryunlock(t)   1
7379  #define tasklet_unlock_wait(t) do { } while (0)
7380  #define tasklet_unlock(t) do { } while (0)
7381  #endif
7382 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
7383         smp_mb();
7386 -static inline void tasklet_enable(struct tasklet_struct *t)
7388 -       smp_mb__before_atomic();
7389 -       atomic_dec(&t->count);
7392 +extern void tasklet_enable(struct tasklet_struct *t);
7393  extern void tasklet_kill(struct tasklet_struct *t);
7394  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
7395  extern void tasklet_init(struct tasklet_struct *t,
7396 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
7397         tasklet_kill(&ttimer->tasklet);
7400 +#ifdef CONFIG_PREEMPT_RT_FULL
7401 +extern void softirq_early_init(void);
7402 +#else
7403 +static inline void softirq_early_init(void) { }
7404 +#endif
7406  /*
7407   * Autoprobing for irqs:
7408   *
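
IRQF_NO_SOFTIRQ_CALL, defined above, lets a driver request that its (forced-)threaded handler not process pending softirqs on RT, leaving them to ksoftirqd. A hedged sketch of how the flag would be passed at request time; the irq number, handler and device cookie are placeholders:

#include <linux/interrupt.h>

static irqreturn_t example_handler(int irq, void *dev_id)
{
        /* hard-interrupt work only; softirqs stay with ksoftirqd */
        return IRQ_HANDLED;
}

static int example_setup(unsigned int irq, void *dev)
{
        return request_irq(irq, example_handler,
                           IRQF_NO_SOFTIRQ_CALL,
                           "example-rt-irq", dev);
}
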
7409 diff --git a/include/linux/irq.h b/include/linux/irq.h
7410 index 39e3254e5769..8ebac94fbb9f 100644
7411 --- a/include/linux/irq.h
7412 +++ b/include/linux/irq.h
7413 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
7414   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
7415   *                               it from the spurious interrupt detection
7416   *                               mechanism and from core side polling.
7417 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
7418   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
7419   */
7420  enum {
7421 @@ -99,13 +100,14 @@ enum {
7422         IRQ_PER_CPU_DEVID       = (1 << 17),
7423         IRQ_IS_POLLED           = (1 << 18),
7424         IRQ_DISABLE_UNLAZY      = (1 << 19),
7425 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
7426  };
7428  #define IRQF_MODIFY_MASK       \
7429         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
7430          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
7431          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
7432 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
7433 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
7435  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
7437 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
7438 index 47b9ebd4a74f..2543aab05daa 100644
7439 --- a/include/linux/irq_work.h
7440 +++ b/include/linux/irq_work.h
7441 @@ -16,6 +16,7 @@
7442  #define IRQ_WORK_BUSY          2UL
7443  #define IRQ_WORK_FLAGS         3UL
7444  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
7445 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
7447  struct irq_work {
7448         unsigned long flags;
7449 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
7450  static inline void irq_work_run(void) { }
7451  #endif
7453 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
7454 +void irq_work_tick_soft(void);
7455 +#else
7456 +static inline void irq_work_tick_soft(void) { }
7457 +#endif
7459  #endif /* _LINUX_IRQ_WORK_H */
7460 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
7461 index c9be57931b58..eeeb540971ae 100644
7462 --- a/include/linux/irqdesc.h
7463 +++ b/include/linux/irqdesc.h
7464 @@ -66,6 +66,7 @@ struct irq_desc {
7465         unsigned int            irqs_unhandled;
7466         atomic_t                threads_handled;
7467         int                     threads_handled_last;
7468 +       u64                     random_ip;
7469         raw_spinlock_t          lock;
7470         struct cpumask          *percpu_enabled;
7471         const struct cpumask    *percpu_affinity;
7472 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
7473 index 5dd1272d1ab2..9b77034f7c5e 100644
7474 --- a/include/linux/irqflags.h
7475 +++ b/include/linux/irqflags.h
7476 @@ -25,8 +25,6 @@
7477  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
7478  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
7479  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
7480 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
7481 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
7482  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
7483  #else
7484  # define trace_hardirqs_on()           do { } while (0)
7485 @@ -39,9 +37,15 @@
7486  # define trace_softirqs_enabled(p)     0
7487  # define trace_hardirq_enter()         do { } while (0)
7488  # define trace_hardirq_exit()          do { } while (0)
7489 +# define INIT_TRACE_IRQFLAGS
7490 +#endif
7492 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7493 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7494 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7495 +#else
7496  # define lockdep_softirq_enter()       do { } while (0)
7497  # define lockdep_softirq_exit()                do { } while (0)
7498 -# define INIT_TRACE_IRQFLAGS
7499  #endif
7501  #if defined(CONFIG_IRQSOFF_TRACER) || \
7502 @@ -148,4 +152,23 @@
7504  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7507 + * local_irq* variants depending on RT/!RT
7508 + */
7509 +#ifdef CONFIG_PREEMPT_RT_FULL
7510 +# define local_irq_disable_nort()      do { } while (0)
7511 +# define local_irq_enable_nort()       do { } while (0)
7512 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7513 +# define local_irq_restore_nort(flags) (void)(flags)
7514 +# define local_irq_disable_rt()                local_irq_disable()
7515 +# define local_irq_enable_rt()         local_irq_enable()
7516 +#else
7517 +# define local_irq_disable_nort()      local_irq_disable()
7518 +# define local_irq_enable_nort()       local_irq_enable()
7519 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7520 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7521 +# define local_irq_disable_rt()                do { } while (0)
7522 +# define local_irq_enable_rt()         do { } while (0)
7523 +#endif
7525  #endif
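
The *_nort variants above disable interrupts only on non-RT kernels; with PREEMPT_RT_FULL they degrade to flag saving or no-ops, because the protected section becomes preemptible and has to rely on sleeping locks instead. A small sketch of the substitution they allow; the function and the counter are made up for illustration:

#include <linux/irqflags.h>

static unsigned long example_counter;

static void example_update(void)
{
        unsigned long flags;

        /* Real irq-off section on !RT; on RT this may be preempted and
         * must additionally be covered by a sleeping lock. */
        local_irq_save_nort(flags);
        example_counter++;
        local_irq_restore_nort(flags);
}
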
7526 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7527 index dfaa1f4dcb0c..d57dd06544a1 100644
7528 --- a/include/linux/jbd2.h
7529 +++ b/include/linux/jbd2.h
7530 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7532  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7534 +#ifndef CONFIG_PREEMPT_RT_BASE
7535         bit_spin_lock(BH_State, &bh->b_state);
7536 +#else
7537 +       spin_lock(&bh->b_state_lock);
7538 +#endif
7541  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7543 +#ifndef CONFIG_PREEMPT_RT_BASE
7544         return bit_spin_trylock(BH_State, &bh->b_state);
7545 +#else
7546 +       return spin_trylock(&bh->b_state_lock);
7547 +#endif
7550  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7552 +#ifndef CONFIG_PREEMPT_RT_BASE
7553         return bit_spin_is_locked(BH_State, &bh->b_state);
7554 +#else
7555 +       return spin_is_locked(&bh->b_state_lock);
7556 +#endif
7559  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7561 +#ifndef CONFIG_PREEMPT_RT_BASE
7562         bit_spin_unlock(BH_State, &bh->b_state);
7563 +#else
7564 +       spin_unlock(&bh->b_state_lock);
7565 +#endif
7568  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7570 +#ifndef CONFIG_PREEMPT_RT_BASE
7571         bit_spin_lock(BH_JournalHead, &bh->b_state);
7572 +#else
7573 +       spin_lock(&bh->b_journal_head_lock);
7574 +#endif
7577  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7579 +#ifndef CONFIG_PREEMPT_RT_BASE
7580         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7581 +#else
7582 +       spin_unlock(&bh->b_journal_head_lock);
7583 +#endif
7586  #define J_ASSERT(assert)       BUG_ON(!(assert))
7587 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7588 index 410decacff8f..0861bebfc188 100644
7589 --- a/include/linux/kdb.h
7590 +++ b/include/linux/kdb.h
7591 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7592  extern __printf(1, 2) int kdb_printf(const char *, ...);
7593  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7595 +#define in_kdb_printk()        (kdb_trap_printk)
7596  extern void kdb_init(int level);
7598  /* Access to kdb specific polling devices */
7599 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7600  extern int kdb_unregister(char *);
7601  #else /* ! CONFIG_KGDB_KDB */
7602  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7603 +#define in_kdb_printk() (0)
7604  static inline void kdb_init(int level) {}
7605  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7606                                char *help, short minlen) { return 0; }
7607 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7608 index bc6ed52a39b9..7894d55e4998 100644
7609 --- a/include/linux/kernel.h
7610 +++ b/include/linux/kernel.h
7611 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7612   */
7613  # define might_sleep() \
7614         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7616 +# define might_sleep_no_state_check() \
7617 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7618  # define sched_annotate_sleep()        (current->task_state_change = 0)
7619  #else
7620    static inline void ___might_sleep(const char *file, int line,
7621 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7622    static inline void __might_sleep(const char *file, int line,
7623                                    int preempt_offset) { }
7624  # define might_sleep() do { might_resched(); } while (0)
7625 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7626  # define sched_annotate_sleep() do { } while (0)
7627  #endif
7629 @@ -488,6 +492,7 @@ extern enum system_states {
7630         SYSTEM_HALT,
7631         SYSTEM_POWER_OFF,
7632         SYSTEM_RESTART,
7633 +       SYSTEM_SUSPEND,
7634  } system_state;
7636  #define TAINT_PROPRIETARY_MODULE       0
7637 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7638 index cb483305e1f5..4e5062316bb6 100644
7639 --- a/include/linux/list_bl.h
7640 +++ b/include/linux/list_bl.h
7641 @@ -2,6 +2,7 @@
7642  #define _LINUX_LIST_BL_H
7644  #include <linux/list.h>
7645 +#include <linux/spinlock.h>
7646  #include <linux/bit_spinlock.h>
7648  /*
7649 @@ -32,13 +33,24 @@
7651  struct hlist_bl_head {
7652         struct hlist_bl_node *first;
7653 +#ifdef CONFIG_PREEMPT_RT_BASE
7654 +       raw_spinlock_t lock;
7655 +#endif
7656  };
7658  struct hlist_bl_node {
7659         struct hlist_bl_node *next, **pprev;
7660  };
7661 -#define INIT_HLIST_BL_HEAD(ptr) \
7662 -       ((ptr)->first = NULL)
7664 +#ifdef CONFIG_PREEMPT_RT_BASE
7665 +#define INIT_HLIST_BL_HEAD(h)          \
7666 +do {                                   \
7667 +       (h)->first = NULL;              \
7668 +       raw_spin_lock_init(&(h)->lock); \
7669 +} while (0)
7670 +#else
7671 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7672 +#endif
7674  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7676 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7678  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7680 +#ifndef CONFIG_PREEMPT_RT_BASE
7681         bit_spin_lock(0, (unsigned long *)b);
7682 +#else
7683 +       raw_spin_lock(&b->lock);
7684 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7685 +       __set_bit(0, (unsigned long *)b);
7686 +#endif
7687 +#endif
7690  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7692 +#ifndef CONFIG_PREEMPT_RT_BASE
7693         __bit_spin_unlock(0, (unsigned long *)b);
7694 +#else
7695 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7696 +       __clear_bit(0, (unsigned long *)b);
7697 +#endif
7698 +       raw_spin_unlock(&b->lock);
7699 +#endif
7702  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
7703 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7704 new file mode 100644
7705 index 000000000000..280f884a05a3
7706 --- /dev/null
7707 +++ b/include/linux/locallock.h
7708 @@ -0,0 +1,287 @@
7709 +#ifndef _LINUX_LOCALLOCK_H
7710 +#define _LINUX_LOCALLOCK_H
7712 +#include <linux/percpu.h>
7713 +#include <linux/spinlock.h>
7715 +#ifdef CONFIG_PREEMPT_RT_BASE
7717 +#ifdef CONFIG_DEBUG_SPINLOCK
7718 +# define LL_WARN(cond) WARN_ON(cond)
7719 +#else
7720 +# define LL_WARN(cond) do { } while (0)
7721 +#endif
7724 + * per cpu lock based substitute for local_irq_*()
7725 + */
7726 +struct local_irq_lock {
7727 +       spinlock_t              lock;
7728 +       struct task_struct      *owner;
7729 +       int                     nestcnt;
7730 +       unsigned long           flags;
7733 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7734 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7735 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7737 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7738 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7740 +#define local_irq_lock_init(lvar)                                      \
7741 +       do {                                                            \
7742 +               int __cpu;                                              \
7743 +               for_each_possible_cpu(__cpu)                            \
7744 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7745 +       } while (0)
7748 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
7749 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
7750 + * already takes care of the migrate_disable/enable
7751 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
7752 + */
7753 +#ifdef CONFIG_PREEMPT_RT_FULL
7754 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7755 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7756 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7757 +#else
7758 +# define spin_lock_local(lock)                 spin_lock(lock)
7759 +# define spin_trylock_local(lock)              spin_trylock(lock)
7760 +# define spin_unlock_local(lock)               spin_unlock(lock)
7761 +#endif
7763 +static inline void __local_lock(struct local_irq_lock *lv)
7765 +       if (lv->owner != current) {
7766 +               spin_lock_local(&lv->lock);
7767 +               LL_WARN(lv->owner);
7768 +               LL_WARN(lv->nestcnt);
7769 +               lv->owner = current;
7770 +       }
7771 +       lv->nestcnt++;
7774 +#define local_lock(lvar)                                       \
7775 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7777 +#define local_lock_on(lvar, cpu)                               \
7778 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7780 +static inline int __local_trylock(struct local_irq_lock *lv)
7782 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7783 +               LL_WARN(lv->owner);
7784 +               LL_WARN(lv->nestcnt);
7785 +               lv->owner = current;
7786 +               lv->nestcnt = 1;
7787 +               return 1;
7788 +       } else if (lv->owner == current) {
7789 +               lv->nestcnt++;
7790 +               return 1;
7791 +       }
7792 +       return 0;
7795 +#define local_trylock(lvar)                                            \
7796 +       ({                                                              \
7797 +               int __locked;                                           \
7798 +               __locked = __local_trylock(&get_local_var(lvar));       \
7799 +               if (!__locked)                                          \
7800 +                       put_local_var(lvar);                            \
7801 +               __locked;                                               \
7802 +       })
7804 +static inline void __local_unlock(struct local_irq_lock *lv)
7806 +       LL_WARN(lv->nestcnt == 0);
7807 +       LL_WARN(lv->owner != current);
7808 +       if (--lv->nestcnt)
7809 +               return;
7811 +       lv->owner = NULL;
7812 +       spin_unlock_local(&lv->lock);
7815 +#define local_unlock(lvar)                                     \
7816 +       do {                                                    \
7817 +               __local_unlock(this_cpu_ptr(&lvar));            \
7818 +               put_local_var(lvar);                            \
7819 +       } while (0)
7821 +#define local_unlock_on(lvar, cpu)                       \
7822 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7824 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7826 +       spin_lock_irqsave(&lv->lock, lv->flags);
7827 +       LL_WARN(lv->owner);
7828 +       LL_WARN(lv->nestcnt);
7829 +       lv->owner = current;
7830 +       lv->nestcnt = 1;
7833 +#define local_lock_irq(lvar)                                           \
7834 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7836 +#define local_lock_irq_on(lvar, cpu)                                   \
7837 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7839 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7841 +       LL_WARN(!lv->nestcnt);
7842 +       LL_WARN(lv->owner != current);
7843 +       lv->owner = NULL;
7844 +       lv->nestcnt = 0;
7845 +       spin_unlock_irq(&lv->lock);
7848 +#define local_unlock_irq(lvar)                                         \
7849 +       do {                                                            \
7850 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7851 +               put_local_var(lvar);                                    \
7852 +       } while (0)
7854 +#define local_unlock_irq_on(lvar, cpu)                                 \
7855 +       do {                                                            \
7856 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7857 +       } while (0)
7859 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7861 +       if (lv->owner != current) {
7862 +               __local_lock_irq(lv);
7863 +               return 0;
7864 +       } else {
7865 +               lv->nestcnt++;
7866 +               return 1;
7867 +       }
7870 +#define local_lock_irqsave(lvar, _flags)                               \
7871 +       do {                                                            \
7872 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7873 +                       put_local_var(lvar);                            \
7874 +               _flags = __this_cpu_read(lvar.flags);                   \
7875 +       } while (0)
7877 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7878 +       do {                                                            \
7879 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7880 +               _flags = per_cpu(lvar, cpu).flags;                      \
7881 +       } while (0)
7883 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7884 +                                           unsigned long flags)
7886 +       LL_WARN(!lv->nestcnt);
7887 +       LL_WARN(lv->owner != current);
7888 +       if (--lv->nestcnt)
7889 +               return 0;
7891 +       lv->owner = NULL;
7892 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7893 +       return 1;
7896 +#define local_unlock_irqrestore(lvar, flags)                           \
7897 +       do {                                                            \
7898 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7899 +                       put_local_var(lvar);                            \
7900 +       } while (0)
7902 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7903 +       do {                                                            \
7904 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7905 +       } while (0)
7907 +#define local_spin_trylock_irq(lvar, lock)                             \
7908 +       ({                                                              \
7909 +               int __locked;                                           \
7910 +               local_lock_irq(lvar);                                   \
7911 +               __locked = spin_trylock(lock);                          \
7912 +               if (!__locked)                                          \
7913 +                       local_unlock_irq(lvar);                         \
7914 +               __locked;                                               \
7915 +       })
7917 +#define local_spin_lock_irq(lvar, lock)                                        \
7918 +       do {                                                            \
7919 +               local_lock_irq(lvar);                                   \
7920 +               spin_lock(lock);                                        \
7921 +       } while (0)
7923 +#define local_spin_unlock_irq(lvar, lock)                              \
7924 +       do {                                                            \
7925 +               spin_unlock(lock);                                      \
7926 +               local_unlock_irq(lvar);                                 \
7927 +       } while (0)
7929 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7930 +       do {                                                            \
7931 +               local_lock_irqsave(lvar, flags);                        \
7932 +               spin_lock(lock);                                        \
7933 +       } while (0)
7935 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7936 +       do {                                                            \
7937 +               spin_unlock(lock);                                      \
7938 +               local_unlock_irqrestore(lvar, flags);                   \
7939 +       } while (0)
7941 +#define get_locked_var(lvar, var)                                      \
7942 +       (*({                                                            \
7943 +               local_lock(lvar);                                       \
7944 +               this_cpu_ptr(&var);                                     \
7945 +       }))
7947 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7949 +#define local_lock_cpu(lvar)                                           \
7950 +       ({                                                              \
7951 +               local_lock(lvar);                                       \
7952 +               smp_processor_id();                                     \
7953 +       })
7955 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7957 +#else /* PREEMPT_RT_BASE */
7959 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7960 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7962 +static inline void local_irq_lock_init(int lvar) { }
7964 +#define local_trylock(lvar)                                    \
7965 +       ({                                                      \
7966 +               preempt_disable();                              \
7967 +               1;                                              \
7968 +       })
7970 +#define local_lock(lvar)                       preempt_disable()
7971 +#define local_unlock(lvar)                     preempt_enable()
7972 +#define local_lock_irq(lvar)                   local_irq_disable()
7973 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7974 +#define local_unlock_irq(lvar)                 local_irq_enable()
7975 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7976 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7977 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7979 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7980 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7981 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7982 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7983 +       spin_lock_irqsave(lock, flags)
7984 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7985 +       spin_unlock_irqrestore(lock, flags)
7987 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7988 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7990 +#define local_lock_cpu(lvar)                   get_cpu()
7991 +#define local_unlock_cpu(lvar)                 put_cpu()
7993 +#endif
7995 +#endif
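
The local lock API introduced in locallock.h above replaces open-coded preempt/irq disabling around per-CPU data: on RT it takes a per-CPU spinlock with owner and nesting tracking, on !RT it maps back to preempt_disable()/local_irq_*(). A minimal usage sketch with a made-up per-CPU variable:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_stat);
static DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_account(void)
{
        /* Serializes against other users of example_lock on this CPU;
         * preemptible on RT, preempt-disabled on !RT. */
        local_lock(example_lock);
        __this_cpu_inc(example_stat);
        local_unlock(example_lock);
}
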
7996 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7997 index e8471c2ca83a..08bde1a7a987 100644
7998 --- a/include/linux/mm_types.h
7999 +++ b/include/linux/mm_types.h
8000 @@ -11,6 +11,7 @@
8001  #include <linux/completion.h>
8002  #include <linux/cpumask.h>
8003  #include <linux/uprobes.h>
8004 +#include <linux/rcupdate.h>
8005  #include <linux/page-flags-layout.h>
8006  #include <linux/workqueue.h>
8007  #include <asm/page.h>
8008 @@ -513,6 +514,9 @@ struct mm_struct {
8009         bool tlb_flush_batched;
8010  #endif
8011         struct uprobes_state uprobes_state;
8012 +#ifdef CONFIG_PREEMPT_RT_BASE
8013 +       struct rcu_head delayed_drop;
8014 +#endif
8015  #ifdef CONFIG_X86_INTEL_MPX
8016         /* address of the bounds directory */
8017         void __user *bd_addr;
8018 diff --git a/include/linux/module.h b/include/linux/module.h
8019 index 0c3207d26ac0..5944baaa3f28 100644
8020 --- a/include/linux/module.h
8021 +++ b/include/linux/module.h
8022 @@ -496,6 +496,7 @@ static inline int module_is_live(struct module *mod)
8023  struct module *__module_text_address(unsigned long addr);
8024  struct module *__module_address(unsigned long addr);
8025  bool is_module_address(unsigned long addr);
8026 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
8027  bool is_module_percpu_address(unsigned long addr);
8028  bool is_module_text_address(unsigned long addr);
8030 @@ -663,6 +664,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
8031         return false;
8034 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
8036 +       return false;
8039  static inline bool is_module_text_address(unsigned long addr)
8041         return false;
8042 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
8043 index 2cb7531e7d7a..b3fdfc820216 100644
8044 --- a/include/linux/mutex.h
8045 +++ b/include/linux/mutex.h
8046 @@ -19,6 +19,17 @@
8047  #include <asm/processor.h>
8048  #include <linux/osq_lock.h>
8050 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8051 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8052 +       , .dep_map = { .name = #lockname }
8053 +#else
8054 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8055 +#endif
8057 +#ifdef CONFIG_PREEMPT_RT_FULL
8058 +# include <linux/mutex_rt.h>
8059 +#else
8061  /*
8062   * Simple, straightforward mutexes with strict semantics:
8063   *
8064 @@ -99,13 +110,6 @@ do {                                                        \
8065  static inline void mutex_destroy(struct mutex *lock) {}
8066  #endif
8068 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
8069 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8070 -               , .dep_map = { .name = #lockname }
8071 -#else
8072 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8073 -#endif
8075  #define __MUTEX_INITIALIZER(lockname) \
8076                 { .count = ATOMIC_INIT(1) \
8077                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
8078 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
8079  extern int mutex_trylock(struct mutex *lock);
8080  extern void mutex_unlock(struct mutex *lock);
8082 +#endif /* !PREEMPT_RT_FULL */
8084  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
8086  #endif /* __LINUX_MUTEX_H */
8087 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
8088 new file mode 100644
8089 index 000000000000..e0284edec655
8090 --- /dev/null
8091 +++ b/include/linux/mutex_rt.h
8092 @@ -0,0 +1,89 @@
8093 +#ifndef __LINUX_MUTEX_RT_H
8094 +#define __LINUX_MUTEX_RT_H
8096 +#ifndef __LINUX_MUTEX_H
8097 +#error "Please include mutex.h"
8098 +#endif
8100 +#include <linux/rtmutex.h>
8102 +/* FIXME: Just for __lockfunc */
8103 +#include <linux/spinlock.h>
8105 +struct mutex {
8106 +       struct rt_mutex         lock;
8107 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8108 +       struct lockdep_map      dep_map;
8109 +#endif
8112 +#define __MUTEX_INITIALIZER(mutexname)                                 \
8113 +       {                                                               \
8114 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
8115 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
8116 +       }
8118 +#define DEFINE_MUTEX(mutexname)                                                \
8119 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
8121 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
8122 +extern void __lockfunc _mutex_lock(struct mutex *lock);
8123 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
8124 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
8125 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
8126 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
8127 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
8128 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
8129 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
8130 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
8132 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
8133 +#define mutex_lock(l)                  _mutex_lock(l)
8134 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
8135 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
8136 +#define mutex_trylock(l)               _mutex_trylock(l)
8137 +#define mutex_unlock(l)                        _mutex_unlock(l)
8139 +#ifdef CONFIG_DEBUG_MUTEXES
8140 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
8141 +#else
8142 +static inline void mutex_destroy(struct mutex *lock) {}
8143 +#endif
8145 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8146 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
8147 +# define mutex_lock_interruptible_nested(l, s) \
8148 +                                       _mutex_lock_interruptible_nested(l, s)
8149 +# define mutex_lock_killable_nested(l, s) \
8150 +                                       _mutex_lock_killable_nested(l, s)
8152 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
8153 +do {                                                                   \
8154 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
8155 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
8156 +} while (0)
8158 +#else
8159 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
8160 +# define mutex_lock_interruptible_nested(l, s) \
8161 +                                       _mutex_lock_interruptible(l)
8162 +# define mutex_lock_killable_nested(l, s) \
8163 +                                       _mutex_lock_killable(l)
8164 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
8165 +#endif
8167 +# define mutex_init(mutex)                             \
8168 +do {                                                   \
8169 +       static struct lock_class_key __key;             \
8170 +                                                       \
8171 +       rt_mutex_init(&(mutex)->lock);                  \
8172 +       __mutex_do_init((mutex), #mutex, &__key);       \
8173 +} while (0)
8175 +# define __mutex_init(mutex, name, key)                        \
8176 +do {                                                   \
8177 +       rt_mutex_init(&(mutex)->lock);                  \
8178 +       __mutex_do_init((mutex), name, key);            \
8179 +} while (0)
8181 +#endif
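
With PREEMPT_RT_FULL, mutex.h now pulls in mutex_rt.h above and every struct mutex is backed by a priority-inheriting rt_mutex; the external API is unchanged, so callers keep compiling as before. For illustration only, the usual pattern that ends up in _mutex_lock()/_mutex_unlock() on RT:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);

static void example_critical_section(void)
{
        mutex_lock(&example_mutex);     /* rt_mutex based on RT */
        /* ... sleepable critical section ... */
        mutex_unlock(&example_mutex);
}
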
8182 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
8183 index 47c7f5b8f675..85fc72b8a92b 100644
8184 --- a/include/linux/netdevice.h
8185 +++ b/include/linux/netdevice.h
8186 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
8187  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
8189  void __napi_schedule(struct napi_struct *n);
8192 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
8193 + * run as threads, and they can also be preempted (without PREEMPT_RT
8194 + * interrupt threads can not be preempted). Which means that calling
8195 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
8196 + * and can corrupt the napi->poll_list.
8197 + */
8198 +#ifdef CONFIG_PREEMPT_RT_FULL
8199 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
8200 +#else
8201  void __napi_schedule_irqoff(struct napi_struct *n);
8202 +#endif
8204  static inline bool napi_disable_pending(struct napi_struct *n)
8206 @@ -2464,14 +2476,53 @@ void netdev_freemem(struct net_device *dev);
8207  void synchronize_net(void);
8208  int init_dummy_netdev(struct net_device *dev);
8210 -DECLARE_PER_CPU(int, xmit_recursion);
8211  #define XMIT_RECURSION_LIMIT   10
8212 +#ifdef CONFIG_PREEMPT_RT_FULL
8213 +static inline int dev_recursion_level(void)
8215 +       return current->xmit_recursion;
8218 +static inline int xmit_rec_read(void)
8220 +       return current->xmit_recursion;
8223 +static inline void xmit_rec_inc(void)
8225 +       current->xmit_recursion++;
8228 +static inline void xmit_rec_dec(void)
8230 +       current->xmit_recursion--;
8233 +#else
8235 +DECLARE_PER_CPU(int, xmit_recursion);
8237  static inline int dev_recursion_level(void)
8239         return this_cpu_read(xmit_recursion);
8242 +static inline int xmit_rec_read(void)
8244 +       return __this_cpu_read(xmit_recursion);
8247 +static inline void xmit_rec_inc(void)
8249 +       __this_cpu_inc(xmit_recursion);
8252 +static inline void xmit_rec_dec(void)
8254 +       __this_cpu_dec(xmit_recursion);
8256 +#endif
8258  struct net_device *dev_get_by_index(struct net *net, int ifindex);
8259  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
8260  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
8261 @@ -2856,6 +2907,7 @@ struct softnet_data {
8262         unsigned int            dropped;
8263         struct sk_buff_head     input_pkt_queue;
8264         struct napi_struct      backlog;
8265 +       struct sk_buff_head     tofree_queue;
8267  };
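
On RT the transmit-recursion counter above moves from a per-CPU variable into the task (current->xmit_recursion), since the transmitting context may be preempted or migrated. A hedged sketch of the guard these helpers implement, simplified from what the core xmit path does and not part of this patch:

#include <linux/errno.h>
#include <linux/netdevice.h>

/* Illustrative recursion guard around a nested transmit. */
static int example_nested_xmit(struct sk_buff *skb)
{
        int ret;

        if (xmit_rec_read() > XMIT_RECURSION_LIMIT) {
                kfree_skb(skb);
                return -ELOOP;
        }

        xmit_rec_inc();
        ret = dev_queue_xmit(skb);
        xmit_rec_dec();
        return ret;
}
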
8269 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
8270 index 2ad1a2b289b5..b4d10155af54 100644
8271 --- a/include/linux/netfilter/x_tables.h
8272 +++ b/include/linux/netfilter/x_tables.h
8273 @@ -4,6 +4,7 @@
8275  #include <linux/netdevice.h>
8276  #include <linux/static_key.h>
8277 +#include <linux/locallock.h>
8278  #include <uapi/linux/netfilter/x_tables.h>
8280  /* Test a struct->invflags and a boolean for inequality */
8281 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
8282   */
8283  DECLARE_PER_CPU(seqcount_t, xt_recseq);
8285 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
8287  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
8288   *
8289   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
8290 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
8292         unsigned int addend;
8294 +       /* RT protection */
8295 +       local_lock(xt_write_lock);
8297         /*
8298          * Low order bit of sequence is set if we already
8299          * called xt_write_recseq_begin().
8300 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
8301         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
8302         smp_wmb();
8303         __this_cpu_add(xt_recseq.sequence, addend);
8304 +       local_unlock(xt_write_lock);
8307  /*
8308 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
8309 index 810124b33327..d54ca43d571f 100644
8310 --- a/include/linux/nfs_fs.h
8311 +++ b/include/linux/nfs_fs.h
8312 @@ -165,7 +165,11 @@ struct nfs_inode {
8314         /* Readers: in-flight sillydelete RPC calls */
8315         /* Writers: rmdir */
8316 +#ifdef CONFIG_PREEMPT_RT_BASE
8317 +       struct semaphore        rmdir_sem;
8318 +#else
8319         struct rw_semaphore     rmdir_sem;
8320 +#endif
8322  #if IS_ENABLED(CONFIG_NFS_V4)
8323         struct nfs4_cached_acl  *nfs4_acl;
8324 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
8325 index 3bf867a0c3b3..71c6bdd14c8a 100644
8326 --- a/include/linux/nfs_xdr.h
8327 +++ b/include/linux/nfs_xdr.h
8328 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
8329         struct nfs_removeargs args;
8330         struct nfs_removeres res;
8331         struct dentry *dentry;
8332 -       wait_queue_head_t wq;
8333 +       struct swait_queue_head wq;
8334         struct rpc_cred *cred;
8335         struct nfs_fattr dir_attr;
8336         long timeout;
8337 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
8338 index 4149868de4e6..babe5b9bcb91 100644
8339 --- a/include/linux/notifier.h
8340 +++ b/include/linux/notifier.h
8341 @@ -6,7 +6,7 @@
8342   *
8343   *                             Alan Cox <Alan.Cox@linux.org>
8344   */
8347  #ifndef _LINUX_NOTIFIER_H
8348  #define _LINUX_NOTIFIER_H
8349  #include <linux/errno.h>
8350 @@ -42,9 +42,7 @@
8351   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
8352   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
8353   * SRCU notifier chains should be used when the chain will be called very
8354 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
8355 - * chains are slightly more difficult to use because they require special
8356 - * runtime initialization.
8357 + * often but notifier_blocks will seldom be removed.
8358   */
8360  struct notifier_block;
8361 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
8362                 (name)->head = NULL;            \
8363         } while (0)
8365 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
8366 +/* srcu_notifier_heads must be cleaned up dynamically */
8367  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8368  #define srcu_cleanup_notifier_head(name)       \
8369                 cleanup_srcu_struct(&(name)->srcu);
8370 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8371                 .head = NULL }
8372  #define RAW_NOTIFIER_INIT(name)        {                               \
8373                 .head = NULL }
8374 -/* srcu_notifier_heads cannot be initialized statically */
8376 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
8377 +       {                                                       \
8378 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
8379 +               .head = NULL,                                   \
8380 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
8381 +       }
8383  #define ATOMIC_NOTIFIER_HEAD(name)                             \
8384         struct atomic_notifier_head name =                      \
8385 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8386         struct raw_notifier_head name =                         \
8387                 RAW_NOTIFIER_INIT(name)
8389 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
8390 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
8391 +                       name##_head_srcu_array);                \
8392 +       mod struct srcu_notifier_head name =                    \
8393 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
8395 +#define SRCU_NOTIFIER_HEAD(name)                               \
8396 +       _SRCU_NOTIFIER_HEAD(name, )
8398 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
8399 +       _SRCU_NOTIFIER_HEAD(name, static)
8401  #ifdef __KERNEL__
8403  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
8404 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
8406  /*
8407   *     Declared notifiers so far. I can imagine quite a few more chains
8408 - *     over time (eg laptop power reset chains, reboot chain (to clean 
8409 + *     over time (eg laptop power reset chains, reboot chain (to clean
8410   *     device units up), device [un]mount chain, module load/unload chain,
8411 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
8412 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
8413   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
8414   */
8417  /* CPU notfiers are defined in include/linux/cpu.h. */
8419  /* netdevice notifiers are defined in include/linux/netdevice.h */
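
The SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() macros added above make static initialization of SRCU notifier chains possible by placing the per-CPU srcu array next to the head, so srcu_init_notifier_head() is no longer the only setup path. An illustrative declaration and call chain; the names are invented for the example:

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(example_chain);

static int example_notify(unsigned long event, void *data)
{
        return srcu_notifier_call_chain(&example_chain, event, data);
}
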
8420 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
8421 index 5b2e6159b744..ea940f451606 100644
8422 --- a/include/linux/percpu-rwsem.h
8423 +++ b/include/linux/percpu-rwsem.h
8424 @@ -4,7 +4,7 @@
8425  #include <linux/atomic.h>
8426  #include <linux/rwsem.h>
8427  #include <linux/percpu.h>
8428 -#include <linux/wait.h>
8429 +#include <linux/swait.h>
8430  #include <linux/rcu_sync.h>
8431  #include <linux/lockdep.h>
8433 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
8434         struct rcu_sync         rss;
8435         unsigned int __percpu   *read_count;
8436         struct rw_semaphore     rw_sem;
8437 -       wait_queue_head_t       writer;
8438 +       struct swait_queue_head writer;
8439         int                     readers_block;
8440  };
8442 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
8443         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
8444         .read_count = &__percpu_rwsem_rc_##name,                        \
8445         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
8446 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
8447 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
8450  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
8451  extern void __percpu_up_read(struct percpu_rw_semaphore *);
8453 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
8454 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8456         might_sleep();
8458 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
8459         __this_cpu_inc(*sem->read_count);
8460         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
8461                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
8462 -       barrier();
8463         /*
8464 -        * The barrier() prevents the compiler from
8465 +        * The preempt_enable() prevents the compiler from
8466          * bleeding the critical section out.
8467          */
8470 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8472 -       percpu_down_read_preempt_disable(sem);
8473         preempt_enable();
8476 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
8477         return ret;
8480 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
8481 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8483 -       /*
8484 -        * The barrier() prevents the compiler from
8485 -        * bleeding the critical section out.
8486 -        */
8487 -       barrier();
8488 +       preempt_disable();
8489         /*
8490          * Same as in percpu_down_read().
8491          */
8492 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
8493         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
8496 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8498 -       preempt_disable();
8499 -       percpu_up_read_preempt_enable(sem);
8502  extern void percpu_down_write(struct percpu_rw_semaphore *);
8503  extern void percpu_up_write(struct percpu_rw_semaphore *);
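[Annotation, not part of the patch] This percpu-rwsem change switches the writer wait queue to a simple wait queue (swait) and folds the preempt_disable()/preempt_enable() handling into percpu_down_read()/percpu_up_read(), dropping the *_preempt_disable()/*_preempt_enable() variants. Callers keep the usual pattern; a minimal sketch with an illustrative semaphore name:

        #include <linux/percpu-rwsem.h>

        static DEFINE_STATIC_PERCPU_RWSEM(my_sem);      /* illustrative name */

        static void reader(void)
        {
                percpu_down_read(&my_sem);
                /* read-side critical section */
                percpu_up_read(&my_sem);
        }

        static void writer(void)
        {
                percpu_down_write(&my_sem);
                /* exclusive section */
                percpu_up_write(&my_sem);
        }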
8505 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
8506 index 56939d3f6e53..b988bf40ad3e 100644
8507 --- a/include/linux/percpu.h
8508 +++ b/include/linux/percpu.h
8509 @@ -18,6 +18,35 @@
8510  #define PERCPU_MODULE_RESERVE          0
8511  #endif
8513 +#ifdef CONFIG_PREEMPT_RT_FULL
8515 +#define get_local_var(var) (*({        \
8516 +       migrate_disable();      \
8517 +       this_cpu_ptr(&var);     }))
8519 +#define put_local_var(var) do {        \
8520 +       (void)&(var);           \
8521 +       migrate_enable();       \
8522 +} while (0)
8524 +# define get_local_ptr(var) ({ \
8525 +       migrate_disable();      \
8526 +       this_cpu_ptr(var);      })
8528 +# define put_local_ptr(var) do {       \
8529 +       (void)(var);                    \
8530 +       migrate_enable();               \
8531 +} while (0)
8533 +#else
8535 +#define get_local_var(var)     get_cpu_var(var)
8536 +#define put_local_var(var)     put_cpu_var(var)
8537 +#define get_local_ptr(var)     get_cpu_ptr(var)
8538 +#define put_local_ptr(var)     put_cpu_ptr(var)
8540 +#endif
8542  /* minimum unit size, also is the maximum supported allocation size */
8543  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8545 @@ -110,6 +139,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
8546  #endif
8548  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
8549 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
8550  extern bool is_kernel_percpu_address(unsigned long addr);
8552  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
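[Annotation, not part of the patch] get_local_var()/put_local_var() are RT-friendly replacements for get_cpu_var()/put_cpu_var() in code that only needs CPU affinity: on PREEMPT_RT_FULL they disable migration instead of preemption, so the section stays preemptible. A minimal sketch (the per-CPU variable and function names are made up):

        #include <linux/percpu.h>

        static DEFINE_PER_CPU(int, my_counter);          /* illustrative per-CPU variable */

        static void bump_counter(void)
        {
                int *cnt = &get_local_var(my_counter);   /* migration disabled on RT */

                (*cnt)++;                                /* still preemptible on RT */
                put_local_var(my_counter);               /* migration enabled again */
        }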
8553 diff --git a/include/linux/pid.h b/include/linux/pid.h
8554 index 97b745ddece5..01a5460a0c85 100644
8555 --- a/include/linux/pid.h
8556 +++ b/include/linux/pid.h
8557 @@ -2,6 +2,7 @@
8558  #define _LINUX_PID_H
8560  #include <linux/rcupdate.h>
8561 +#include <linux/atomic.h>
8563  enum pid_type
8565 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8566 index 7eeceac52dea..f97c54265904 100644
8567 --- a/include/linux/preempt.h
8568 +++ b/include/linux/preempt.h
8569 @@ -50,7 +50,11 @@
8570  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8571  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8573 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8574 +#ifndef CONFIG_PREEMPT_RT_FULL
8575 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8576 +#else
8577 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8578 +#endif
8580  /* We use the MSB mostly because its available */
8581  #define PREEMPT_NEED_RESCHED   0x80000000
8582 @@ -59,9 +63,15 @@
8583  #include <asm/preempt.h>
8585  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8586 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8587  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8588                                  | NMI_MASK))
8589 +#ifndef CONFIG_PREEMPT_RT_FULL
8590 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8591 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8592 +#else
8593 +# define softirq_count()       (0UL)
8594 +extern int in_serving_softirq(void);
8595 +#endif
8597  /*
8598   * Are we doing bottom half or hardware interrupt processing?
8599 @@ -79,7 +89,6 @@
8600  #define in_irq()               (hardirq_count())
8601  #define in_softirq()           (softirq_count())
8602  #define in_interrupt()         (irq_count())
8603 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8604  #define in_nmi()               (preempt_count() & NMI_MASK)
8605  #define in_task()              (!(preempt_count() & \
8606                                    (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
8607 @@ -96,7 +105,11 @@
8608  /*
8609   * The preempt_count offset after spin_lock()
8610   */
8611 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8612  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8613 +#else
8614 +#define PREEMPT_LOCK_OFFSET    0
8615 +#endif
8617  /*
8618   * The preempt_count offset needed for things like:
8619 @@ -145,6 +158,20 @@ extern void preempt_count_sub(int val);
8620  #define preempt_count_inc() preempt_count_add(1)
8621  #define preempt_count_dec() preempt_count_sub(1)
8623 +#ifdef CONFIG_PREEMPT_LAZY
8624 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8625 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8626 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8627 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8628 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8629 +#else
8630 +#define add_preempt_lazy_count(val)    do { } while (0)
8631 +#define sub_preempt_lazy_count(val)    do { } while (0)
8632 +#define inc_preempt_lazy_count()       do { } while (0)
8633 +#define dec_preempt_lazy_count()       do { } while (0)
8634 +#define preempt_lazy_count()           (0)
8635 +#endif
8637  #ifdef CONFIG_PREEMPT_COUNT
8639  #define preempt_disable() \
8640 @@ -153,13 +180,25 @@ do { \
8641         barrier(); \
8642  } while (0)
8644 +#define preempt_lazy_disable() \
8645 +do { \
8646 +       inc_preempt_lazy_count(); \
8647 +       barrier(); \
8648 +} while (0)
8650  #define sched_preempt_enable_no_resched() \
8651  do { \
8652         barrier(); \
8653         preempt_count_dec(); \
8654  } while (0)
8656 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8657 +#ifdef CONFIG_PREEMPT_RT_BASE
8658 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8659 +# define preempt_check_resched_rt() preempt_check_resched()
8660 +#else
8661 +# define preempt_enable_no_resched() preempt_enable()
8662 +# define preempt_check_resched_rt() barrier();
8663 +#endif
8665  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8667 @@ -184,6 +223,13 @@ do { \
8668                 __preempt_schedule(); \
8669  } while (0)
8671 +#define preempt_lazy_enable() \
8672 +do { \
8673 +       dec_preempt_lazy_count(); \
8674 +       barrier(); \
8675 +       preempt_check_resched(); \
8676 +} while (0)
8678  #else /* !CONFIG_PREEMPT */
8679  #define preempt_enable() \
8680  do { \
8681 @@ -229,6 +275,7 @@ do { \
8682  #define preempt_disable_notrace()              barrier()
8683  #define preempt_enable_no_resched_notrace()    barrier()
8684  #define preempt_enable_notrace()               barrier()
8685 +#define preempt_check_resched_rt()             barrier()
8686  #define preemptible()                          0
8688  #endif /* CONFIG_PREEMPT_COUNT */
8689 @@ -249,10 +296,31 @@ do { \
8690  } while (0)
8691  #define preempt_fold_need_resched() \
8692  do { \
8693 -       if (tif_need_resched()) \
8694 +       if (tif_need_resched_now()) \
8695                 set_preempt_need_resched(); \
8696  } while (0)
8698 +#ifdef CONFIG_PREEMPT_RT_FULL
8699 +# define preempt_disable_rt()          preempt_disable()
8700 +# define preempt_enable_rt()           preempt_enable()
8701 +# define preempt_disable_nort()                barrier()
8702 +# define preempt_enable_nort()         barrier()
8703 +# ifdef CONFIG_SMP
8704 +   extern void migrate_disable(void);
8705 +   extern void migrate_enable(void);
8706 +# else /* CONFIG_SMP */
8707 +#  define migrate_disable()            barrier()
8708 +#  define migrate_enable()             barrier()
8709 +# endif /* CONFIG_SMP */
8710 +#else
8711 +# define preempt_disable_rt()          barrier()
8712 +# define preempt_enable_rt()           barrier()
8713 +# define preempt_disable_nort()                preempt_disable()
8714 +# define preempt_enable_nort()         preempt_enable()
8715 +# define migrate_disable()             preempt_disable()
8716 +# define migrate_enable()              preempt_enable()
8717 +#endif
8719  #ifdef CONFIG_PREEMPT_NOTIFIERS
8721  struct preempt_notifier;
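[Annotation, not part of the patch] The preempt.h changes add three RT building blocks: softirq_count() collapses to 0 on RT (softirqs run in thread context), a lazy preempt count for CONFIG_PREEMPT_LAZY, and migrate_disable()/migrate_enable(), which pin a task to its current CPU without disabling preemption (on non-RT they fall back to preempt_disable()/preempt_enable()). A minimal sketch of the migrate_disable() pattern, with an illustrative function name:

        #include <linux/preempt.h>
        #include <linux/smp.h>

        static void touch_this_cpu_state(void)
        {
                int cpu;

                migrate_disable();          /* stay on this CPU, remain preemptible on RT */
                cpu = smp_processor_id();   /* stable while migration is disabled */
                /* ... operate on per-CPU state belonging to 'cpu' ... */
                migrate_enable();
        }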
8722 diff --git a/include/linux/printk.h b/include/linux/printk.h
8723 index eac1af8502bb..37e647af0b0b 100644
8724 --- a/include/linux/printk.h
8725 +++ b/include/linux/printk.h
8726 @@ -126,9 +126,11 @@ struct va_format {
8727  #ifdef CONFIG_EARLY_PRINTK
8728  extern asmlinkage __printf(1, 2)
8729  void early_printk(const char *fmt, ...);
8730 +extern void printk_kill(void);
8731  #else
8732  static inline __printf(1, 2) __cold
8733  void early_printk(const char *s, ...) { }
8734 +static inline void printk_kill(void) { }
8735  #endif
8737  #ifdef CONFIG_PRINTK_NMI
8738 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8739 index af3581b8a451..277295039c8f 100644
8740 --- a/include/linux/radix-tree.h
8741 +++ b/include/linux/radix-tree.h
8742 @@ -292,6 +292,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8743  int radix_tree_preload(gfp_t gfp_mask);
8744  int radix_tree_maybe_preload(gfp_t gfp_mask);
8745  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8746 +void radix_tree_preload_end(void);
8748  void radix_tree_init(void);
8749  void *radix_tree_tag_set(struct radix_tree_root *root,
8750                         unsigned long index, unsigned int tag);
8751 @@ -314,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
8752  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8753  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8755 -static inline void radix_tree_preload_end(void)
8757 -       preempt_enable();
8760  /**
8761   * struct radix_tree_iter - radix tree iterator state
8762   *
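[Annotation, not part of the patch] radix_tree_preload_end() is turned from an inline preempt_enable() into an out-of-line function, presumably so the RT build can pair it with whatever protection its preload path uses. The caller-visible pattern is unchanged; roughly (tree, lock and function names are illustrative):

        #include <linux/radix-tree.h>
        #include <linux/spinlock.h>

        static RADIX_TREE(my_tree, GFP_KERNEL);          /* illustrative tree */
        static DEFINE_SPINLOCK(my_tree_lock);

        static int insert_item(unsigned long index, void *item)
        {
                int err;

                err = radix_tree_preload(GFP_KERNEL);    /* may sleep */
                if (err)
                        return err;
                spin_lock(&my_tree_lock);
                err = radix_tree_insert(&my_tree, index, item);
                spin_unlock(&my_tree_lock);
                radix_tree_preload_end();                /* pairs with radix_tree_preload() */
                return err;
        }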
8763 diff --git a/include/linux/random.h b/include/linux/random.h
8764 index 16ab429735a7..9d0fecb5b6c2 100644
8765 --- a/include/linux/random.h
8766 +++ b/include/linux/random.h
8767 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8769  extern void add_input_randomness(unsigned int type, unsigned int code,
8770                                  unsigned int value) __latent_entropy;
8771 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8772 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8774  extern void get_random_bytes(void *buf, int nbytes);
8775  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8776 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8777 index e585018498d5..25c64474fc27 100644
8778 --- a/include/linux/rbtree.h
8779 +++ b/include/linux/rbtree.h
8780 @@ -31,7 +31,7 @@
8782  #include <linux/kernel.h>
8783  #include <linux/stddef.h>
8784 -#include <linux/rcupdate.h>
8785 +#include <linux/rcu_assign_pointer.h>
8787  struct rb_node {
8788         unsigned long  __rb_parent_color;
8789 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8790 index d076183e49be..36bfb4dd57ae 100644
8791 --- a/include/linux/rbtree_augmented.h
8792 +++ b/include/linux/rbtree_augmented.h
8793 @@ -26,6 +26,7 @@
8795  #include <linux/compiler.h>
8796  #include <linux/rbtree.h>
8797 +#include <linux/rcupdate.h>
8799  /*
8800   * Please note - only struct rb_augment_callbacks and the prototypes for
8801 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8802 new file mode 100644
8803 index 000000000000..7066962a4379
8804 --- /dev/null
8805 +++ b/include/linux/rcu_assign_pointer.h
8806 @@ -0,0 +1,54 @@
8807 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8808 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8809 +#include <linux/compiler.h>
8810 +#include <asm/barrier.h>
8812 +/**
8813 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8814 + * @v: The value to statically initialize with.
8815 + */
8816 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8818 +/**
8819 + * rcu_assign_pointer() - assign to RCU-protected pointer
8820 + * @p: pointer to assign to
8821 + * @v: value to assign (publish)
8822 + *
8823 + * Assigns the specified value to the specified RCU-protected
8824 + * pointer, ensuring that any concurrent RCU readers will see
8825 + * any prior initialization.
8826 + *
8827 + * Inserts memory barriers on architectures that require them
8828 + * (which is most of them), and also prevents the compiler from
8829 + * reordering the code that initializes the structure after the pointer
8830 + * assignment.  More importantly, this call documents which pointers
8831 + * will be dereferenced by RCU read-side code.
8832 + *
8833 + * In some special cases, you may use RCU_INIT_POINTER() instead
8834 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8835 + * to the fact that it does not constrain either the CPU or the compiler.
8836 + * That said, using RCU_INIT_POINTER() when you should have used
8837 + * rcu_assign_pointer() is a very bad thing that results in
8838 + * impossible-to-diagnose memory corruption.  So please be careful.
8839 + * See the RCU_INIT_POINTER() comment header for details.
8840 + *
8841 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8842 + * once, appearances notwithstanding.  One of the "extra" evaluations
8843 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8844 + * neither of which actually execute the argument.  As with most cpp
8845 + * macros, this execute-arguments-only-once property is important, so
8846 + * please be careful when making changes to rcu_assign_pointer() and the
8847 + * other macros that it invokes.
8848 + */
8849 +#define rcu_assign_pointer(p, v)                                             \
8850 +({                                                                           \
8851 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8852 +                                                                             \
8853 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8854 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8855 +       else                                                                  \
8856 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8857 +       _r_a_p__v;                                                            \
8860 +#endif
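[Annotation, not part of the patch] RCU_INITIALIZER() and rcu_assign_pointer() are split out into this new header so low-level headers such as rbtree.h can use them without pulling in all of rcupdate.h; the macros themselves are unchanged. The usual publish pattern looks like this (structure and variable names are made up):

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct my_config {                       /* illustrative structure */
                int value;
        };

        static struct my_config __rcu *cur_config;

        static int publish_config(int value)
        {
                struct my_config *new = kmalloc(sizeof(*new), GFP_KERNEL);

                if (!new)
                        return -ENOMEM;
                new->value = value;
                /* Readers dereferencing cur_config under rcu_read_lock() are
                 * guaranteed to see the initialized ->value.  Freeing the
                 * previous config (e.g. via kfree_rcu()) is omitted here. */
                rcu_assign_pointer(cur_config, new);
                return 0;
        }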
8861 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8862 index 01f71e1d2e94..30cc001d0d5a 100644
8863 --- a/include/linux/rcupdate.h
8864 +++ b/include/linux/rcupdate.h
8865 @@ -46,6 +46,7 @@
8866  #include <linux/compiler.h>
8867  #include <linux/ktime.h>
8868  #include <linux/irqflags.h>
8869 +#include <linux/rcu_assign_pointer.h>
8871  #include <asm/barrier.h>
8873 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8875  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8877 +#ifdef CONFIG_PREEMPT_RT_FULL
8878 +#define call_rcu_bh    call_rcu
8879 +#else
8880  /**
8881   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8882   * @head: structure to be used for queueing the RCU updates.
8883 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8884   */
8885  void call_rcu_bh(struct rcu_head *head,
8886                  rcu_callback_t func);
8887 +#endif
8889  /**
8890   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8891 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8892   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8893   */
8894  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8895 +#ifndef CONFIG_PREEMPT_RT_FULL
8896 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8897 +#else
8898 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8899 +#endif
8901  #else /* #ifdef CONFIG_PREEMPT_RCU */
8903 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8904         return 0;
8907 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8909  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8911  /* Internal to kernel */
8912 @@ -505,7 +517,14 @@ extern struct lockdep_map rcu_callback_map;
8913  int debug_lockdep_rcu_enabled(void);
8915  int rcu_read_lock_held(void);
8916 +#ifdef CONFIG_PREEMPT_RT_FULL
8917 +static inline int rcu_read_lock_bh_held(void)
8919 +       return rcu_read_lock_held();
8921 +#else
8922  int rcu_read_lock_bh_held(void);
8923 +#endif
8925  /**
8926   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8927 @@ -625,54 +644,6 @@ static inline void rcu_preempt_sleep_check(void)
8928         ((typeof(*p) __force __kernel *)(________p1)); \
8929  })
8931 -/**
8932 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8933 - * @v: The value to statically initialize with.
8934 - */
8935 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8937 -/**
8938 - * rcu_assign_pointer() - assign to RCU-protected pointer
8939 - * @p: pointer to assign to
8940 - * @v: value to assign (publish)
8941 - *
8942 - * Assigns the specified value to the specified RCU-protected
8943 - * pointer, ensuring that any concurrent RCU readers will see
8944 - * any prior initialization.
8945 - *
8946 - * Inserts memory barriers on architectures that require them
8947 - * (which is most of them), and also prevents the compiler from
8948 - * reordering the code that initializes the structure after the pointer
8949 - * assignment.  More importantly, this call documents which pointers
8950 - * will be dereferenced by RCU read-side code.
8951 - *
8952 - * In some special cases, you may use RCU_INIT_POINTER() instead
8953 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8954 - * to the fact that it does not constrain either the CPU or the compiler.
8955 - * That said, using RCU_INIT_POINTER() when you should have used
8956 - * rcu_assign_pointer() is a very bad thing that results in
8957 - * impossible-to-diagnose memory corruption.  So please be careful.
8958 - * See the RCU_INIT_POINTER() comment header for details.
8959 - *
8960 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8961 - * once, appearances notwithstanding.  One of the "extra" evaluations
8962 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8963 - * neither of which actually execute the argument.  As with most cpp
8964 - * macros, this execute-arguments-only-once property is important, so
8965 - * please be careful when making changes to rcu_assign_pointer() and the
8966 - * other macros that it invokes.
8967 - */
8968 -#define rcu_assign_pointer(p, v)                                             \
8969 -({                                                                           \
8970 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8971 -                                                                             \
8972 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8973 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8974 -       else                                                                  \
8975 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8976 -       _r_a_p__v;                                                            \
8979  /**
8980   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8981   * @p: The pointer to read
8982 @@ -951,10 +922,14 @@ static inline void rcu_read_unlock(void)
8983  static inline void rcu_read_lock_bh(void)
8985         local_bh_disable();
8986 +#ifdef CONFIG_PREEMPT_RT_FULL
8987 +       rcu_read_lock();
8988 +#else
8989         __acquire(RCU_BH);
8990         rcu_lock_acquire(&rcu_bh_lock_map);
8991         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8992                          "rcu_read_lock_bh() used illegally while idle");
8993 +#endif
8996  /*
8997 @@ -964,10 +939,14 @@ static inline void rcu_read_lock_bh(void)
8998   */
8999  static inline void rcu_read_unlock_bh(void)
9001 +#ifdef CONFIG_PREEMPT_RT_FULL
9002 +       rcu_read_unlock();
9003 +#else
9004         RCU_LOCKDEP_WARN(!rcu_is_watching(),
9005                          "rcu_read_unlock_bh() used illegally while idle");
9006         rcu_lock_release(&rcu_bh_lock_map);
9007         __release(RCU_BH);
9008 +#endif
9009         local_bh_enable();
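[Annotation, not part of the patch] On PREEMPT_RT_FULL the _bh RCU flavour is folded into the normal one: call_rcu_bh becomes call_rcu, rcu_read_lock_bh_held() maps to rcu_read_lock_held(), and rcu_read_lock_bh()/rcu_read_unlock_bh() take a regular rcu_read_lock() under local_bh_disable(). Callers keep writing the same code; a sketch of a _bh read side with made-up list and entry names:

        #include <linux/rcupdate.h>
        #include <linux/rculist.h>
        #include <linux/types.h>

        struct my_entry {                        /* illustrative; freed via call_rcu()
                                                  * by the update side (not shown) */
                int key;
                struct list_head node;
                struct rcu_head rcu;
        };

        static LIST_HEAD(my_list);

        static bool key_present(int key)
        {
                struct my_entry *e;
                bool found = false;

                rcu_read_lock_bh();              /* plain rcu_read_lock() + BH-off on RT */
                list_for_each_entry_rcu(e, &my_list, node) {
                        if (e->key == key) {
                                found = true;
                                break;
                        }
                }
                rcu_read_unlock_bh();
                return found;
        }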
9012 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
9013 index 63a4e4cf40a5..08ab12df2863 100644
9014 --- a/include/linux/rcutree.h
9015 +++ b/include/linux/rcutree.h
9016 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
9017         rcu_note_context_switch();
9020 +#ifdef CONFIG_PREEMPT_RT_FULL
9021 +# define synchronize_rcu_bh    synchronize_rcu
9022 +#else
9023  void synchronize_rcu_bh(void);
9024 +#endif
9025  void synchronize_sched_expedited(void);
9026  void synchronize_rcu_expedited(void);
9028 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
9031  void rcu_barrier(void);
9032 +#ifdef CONFIG_PREEMPT_RT_FULL
9033 +# define rcu_barrier_bh                rcu_barrier
9034 +#else
9035  void rcu_barrier_bh(void);
9036 +#endif
9037  void rcu_barrier_sched(void);
9038  unsigned long get_state_synchronize_rcu(void);
9039  void cond_synchronize_rcu(unsigned long oldstate);
9040 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
9041  extern unsigned long rcutorture_testseq;
9042  extern unsigned long rcutorture_vernum;
9043  unsigned long rcu_batches_started(void);
9044 -unsigned long rcu_batches_started_bh(void);
9045  unsigned long rcu_batches_started_sched(void);
9046  unsigned long rcu_batches_completed(void);
9047 -unsigned long rcu_batches_completed_bh(void);
9048  unsigned long rcu_batches_completed_sched(void);
9049  unsigned long rcu_exp_batches_completed(void);
9050  unsigned long rcu_exp_batches_completed_sched(void);
9051  void show_rcu_gp_kthreads(void);
9053  void rcu_force_quiescent_state(void);
9054 -void rcu_bh_force_quiescent_state(void);
9055  void rcu_sched_force_quiescent_state(void);
9057  void rcu_idle_enter(void);
9058 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
9060  bool rcu_is_watching(void);
9062 +#ifndef CONFIG_PREEMPT_RT_FULL
9063 +void rcu_bh_force_quiescent_state(void);
9064 +unsigned long rcu_batches_started_bh(void);
9065 +unsigned long rcu_batches_completed_bh(void);
9066 +#else
9067 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
9068 +# define rcu_batches_completed_bh      rcu_batches_completed
9069 +# define rcu_batches_started_bh                rcu_batches_completed
9070 +#endif
9072  void rcu_all_qs(void);
9074  /* RCUtree hotplug events */
9075 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
9076 index 1abba5ce2a2f..294a8b4875f1 100644
9077 --- a/include/linux/rtmutex.h
9078 +++ b/include/linux/rtmutex.h
9079 @@ -13,11 +13,15 @@
9080  #define __LINUX_RT_MUTEX_H
9082  #include <linux/linkage.h>
9083 +#include <linux/spinlock_types_raw.h>
9084  #include <linux/rbtree.h>
9085 -#include <linux/spinlock_types.h>
9087  extern int max_lock_depth; /* for sysctl */
9089 +#ifdef CONFIG_DEBUG_MUTEXES
9090 +#include <linux/debug_locks.h>
9091 +#endif
9093  /**
9094   * The rt_mutex structure
9095   *
9096 @@ -31,8 +35,8 @@ struct rt_mutex {
9097         struct rb_root          waiters;
9098         struct rb_node          *waiters_leftmost;
9099         struct task_struct      *owner;
9100 -#ifdef CONFIG_DEBUG_RT_MUTEXES
9101         int                     save_state;
9102 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9103         const char              *name, *file;
9104         int                     line;
9105         void                    *magic;
9106 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
9107  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
9108  #endif
9110 +# define rt_mutex_init(mutex)                                  \
9111 +       do {                                                    \
9112 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
9113 +               __rt_mutex_init(mutex, #mutex);                 \
9114 +       } while (0)
9116  #ifdef CONFIG_DEBUG_RT_MUTEXES
9117  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
9118         , .name = #mutexname, .file = __FILE__, .line = __LINE__
9119 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
9120   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
9121  #else
9122  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9123 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
9124  # define rt_mutex_debug_task_free(t)                   do { } while (0)
9125  #endif
9127 -#define __RT_MUTEX_INITIALIZER(mutexname) \
9128 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9129 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
9130 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9131         , .waiters = RB_ROOT \
9132         , .owner = NULL \
9133 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
9134 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9136 +#define __RT_MUTEX_INITIALIZER(mutexname) \
9137 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
9139 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
9140 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
9141 +       , .save_state = 1 }
9143  #define DEFINE_RT_MUTEX(mutexname) \
9144         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
9145 @@ -90,7 +105,9 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
9146  extern void rt_mutex_destroy(struct rt_mutex *lock);
9148  extern void rt_mutex_lock(struct rt_mutex *lock);
9149 +extern int rt_mutex_lock_state(struct rt_mutex *lock, int state);
9150  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
9151 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
9152  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
9153                                struct hrtimer_sleeper *timeout);
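[Annotation, not part of the patch] rt_mutex_init() now always initializes the embedded wait_lock (it is no longer a debug-only variant), save_state moves out of the DEBUG_RT_MUTEXES ifdef, and rt_mutex_lock_state()/rt_mutex_lock_killable() are added. Basic usage is unchanged; a sketch with an illustrative lock name:

        #include <linux/rtmutex.h>

        static DEFINE_RT_MUTEX(my_rtmutex);      /* illustrative lock */

        static int do_work_killable(void)
        {
                int ret = rt_mutex_lock_killable(&my_rtmutex);

                if (ret)                         /* interrupted by a fatal signal */
                        return ret;
                /* ... critical section with priority inheritance ... */
                rt_mutex_unlock(&my_rtmutex);
                return 0;
        }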
9155 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
9156 new file mode 100644
9157 index 000000000000..49ed2d45d3be
9158 --- /dev/null
9159 +++ b/include/linux/rwlock_rt.h
9160 @@ -0,0 +1,99 @@
9161 +#ifndef __LINUX_RWLOCK_RT_H
9162 +#define __LINUX_RWLOCK_RT_H
9164 +#ifndef __LINUX_SPINLOCK_H
9165 +#error Do not include directly. Use spinlock.h
9166 +#endif
9168 +#define rwlock_init(rwl)                               \
9169 +do {                                                   \
9170 +       static struct lock_class_key __key;             \
9171 +                                                       \
9172 +       rt_mutex_init(&(rwl)->lock);                    \
9173 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
9174 +} while (0)
9176 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
9177 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
9178 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
9179 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
9180 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
9181 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
9182 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
9183 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
9184 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
9185 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
9187 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
9188 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
9190 +#define write_trylock_irqsave(lock, flags)     \
9191 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
9193 +#define read_lock_irqsave(lock, flags)                 \
9194 +       do {                                            \
9195 +               typecheck(unsigned long, flags);        \
9196 +               flags = rt_read_lock_irqsave(lock);     \
9197 +       } while (0)
9199 +#define write_lock_irqsave(lock, flags)                        \
9200 +       do {                                            \
9201 +               typecheck(unsigned long, flags);        \
9202 +               flags = rt_write_lock_irqsave(lock);    \
9203 +       } while (0)
9205 +#define read_lock(lock)                rt_read_lock(lock)
9207 +#define read_lock_bh(lock)                             \
9208 +       do {                                            \
9209 +               local_bh_disable();                     \
9210 +               rt_read_lock(lock);                     \
9211 +       } while (0)
9213 +#define read_lock_irq(lock)    read_lock(lock)
9215 +#define write_lock(lock)       rt_write_lock(lock)
9217 +#define write_lock_bh(lock)                            \
9218 +       do {                                            \
9219 +               local_bh_disable();                     \
9220 +               rt_write_lock(lock);                    \
9221 +       } while (0)
9223 +#define write_lock_irq(lock)   write_lock(lock)
9225 +#define read_unlock(lock)      rt_read_unlock(lock)
9227 +#define read_unlock_bh(lock)                           \
9228 +       do {                                            \
9229 +               rt_read_unlock(lock);                   \
9230 +               local_bh_enable();                      \
9231 +       } while (0)
9233 +#define read_unlock_irq(lock)  read_unlock(lock)
9235 +#define write_unlock(lock)     rt_write_unlock(lock)
9237 +#define write_unlock_bh(lock)                          \
9238 +       do {                                            \
9239 +               rt_write_unlock(lock);                  \
9240 +               local_bh_enable();                      \
9241 +       } while (0)
9243 +#define write_unlock_irq(lock) write_unlock(lock)
9245 +#define read_unlock_irqrestore(lock, flags)            \
9246 +       do {                                            \
9247 +               typecheck(unsigned long, flags);        \
9248 +               (void) flags;                           \
9249 +               rt_read_unlock(lock);                   \
9250 +       } while (0)
9252 +#define write_unlock_irqrestore(lock, flags) \
9253 +       do {                                            \
9254 +               typecheck(unsigned long, flags);        \
9255 +               (void) flags;                           \
9256 +               rt_write_unlock(lock);                  \
9257 +       } while (0)
9259 +#endif
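[Annotation, not part of the patch] On RT, rwlock_t becomes an rtmutex-based sleeping lock with single-reader recursion, and the rwlock API keeps its shape, so existing users compile unchanged. A sketch (lock and data names are made up):

        #include <linux/spinlock.h>              /* pulls in rwlock_rt.h on RT builds */

        static DEFINE_RWLOCK(my_rwlock);         /* illustrative */
        static int shared_value;

        static int read_value(void)
        {
                int v;

                read_lock(&my_rwlock);           /* rt_read_lock() on RT */
                v = shared_value;
                read_unlock(&my_rwlock);
                return v;
        }

        static void write_value(int v)
        {
                unsigned long flags;

                /* flags are kept only for API compatibility; the unlock path
                 * above simply discards them */
                write_lock_irqsave(&my_rwlock, flags);
                shared_value = v;
                write_unlock_irqrestore(&my_rwlock, flags);
        }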
9260 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
9261 index cc0072e93e36..5317cd957292 100644
9262 --- a/include/linux/rwlock_types.h
9263 +++ b/include/linux/rwlock_types.h
9264 @@ -1,6 +1,10 @@
9265  #ifndef __LINUX_RWLOCK_TYPES_H
9266  #define __LINUX_RWLOCK_TYPES_H
9268 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
9269 +# error "Do not include directly, include spinlock_types.h"
9270 +#endif
9272  /*
9273   * include/linux/rwlock_types.h - generic rwlock type definitions
9274   *                               and initializers
9275 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
9276 new file mode 100644
9277 index 000000000000..51b28d775fe1
9278 --- /dev/null
9279 +++ b/include/linux/rwlock_types_rt.h
9280 @@ -0,0 +1,33 @@
9281 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
9282 +#define __LINUX_RWLOCK_TYPES_RT_H
9284 +#ifndef __LINUX_SPINLOCK_TYPES_H
9285 +#error "Do not include directly. Include spinlock_types.h instead"
9286 +#endif
9289 + * rwlocks - rtmutex which allows single reader recursion
9290 + */
9291 +typedef struct {
9292 +       struct rt_mutex         lock;
9293 +       int                     read_depth;
9294 +       unsigned int            break_lock;
9295 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9296 +       struct lockdep_map      dep_map;
9297 +#endif
9298 +} rwlock_t;
9300 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9301 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
9302 +#else
9303 +# define RW_DEP_MAP_INIT(lockname)
9304 +#endif
9306 +#define __RW_LOCK_UNLOCKED(name) \
9307 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
9308 +         RW_DEP_MAP_INIT(name) }
9310 +#define DEFINE_RWLOCK(name) \
9311 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
9313 +#endif
9314 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
9315 index dd1d14250340..aa2ac1f65c2d 100644
9316 --- a/include/linux/rwsem.h
9317 +++ b/include/linux/rwsem.h
9318 @@ -19,6 +19,10 @@
9319  #include <linux/osq_lock.h>
9320  #endif
9322 +#ifdef CONFIG_PREEMPT_RT_FULL
9323 +#include <linux/rwsem_rt.h>
9324 +#else /* PREEMPT_RT_FULL */
9326  struct rw_semaphore;
9328  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
9329 @@ -106,6 +110,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
9330         return !list_empty(&sem->wait_list);
9333 +#endif /* !PREEMPT_RT_FULL */
9336 + * The functions below are the same for all rwsem implementations including
9337 + * the RT specific variant.
9338 + */
9340  /*
9341   * lock for reading
9342   */
9343 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
9344 new file mode 100644
9345 index 000000000000..2ffbf093ae92
9346 --- /dev/null
9347 +++ b/include/linux/rwsem_rt.h
9348 @@ -0,0 +1,67 @@
9349 +#ifndef _LINUX_RWSEM_RT_H
9350 +#define _LINUX_RWSEM_RT_H
9352 +#ifndef _LINUX_RWSEM_H
9353 +#error "Include rwsem.h"
9354 +#endif
9356 +#include <linux/rtmutex.h>
9357 +#include <linux/swait.h>
9359 +#define READER_BIAS            (1U << 31)
9360 +#define WRITER_BIAS            (1U << 30)
9362 +struct rw_semaphore {
9363 +       atomic_t                readers;
9364 +       struct rt_mutex         rtmutex;
9365 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9366 +       struct lockdep_map      dep_map;
9367 +#endif
9370 +#define __RWSEM_INITIALIZER(name)                              \
9371 +{                                                              \
9372 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
9373 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
9374 +       RW_DEP_MAP_INIT(name)                                   \
9377 +#define DECLARE_RWSEM(lockname) \
9378 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
9380 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
9381 +                         struct lock_class_key *key);
9383 +#define __init_rwsem(sem, name, key)                   \
9384 +do {                                                   \
9385 +               rt_mutex_init(&(sem)->rtmutex);         \
9386 +               __rwsem_init((sem), (name), (key));     \
9387 +} while (0)
9389 +#define init_rwsem(sem)                                        \
9390 +do {                                                   \
9391 +       static struct lock_class_key __key;             \
9392 +                                                       \
9393 +       __init_rwsem((sem), #sem, &__key);              \
9394 +} while (0)
9396 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
9398 +       return atomic_read(&sem->readers) != READER_BIAS;
9401 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
9403 +       return atomic_read(&sem->readers) > 0;
9406 +extern void __down_read(struct rw_semaphore *sem);
9407 +extern int __down_read_trylock(struct rw_semaphore *sem);
9408 +extern void __down_write(struct rw_semaphore *sem);
9409 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
9410 +extern int __down_write_trylock(struct rw_semaphore *sem);
9411 +extern void __up_read(struct rw_semaphore *sem);
9412 +extern void __up_write(struct rw_semaphore *sem);
9413 +extern void __downgrade_write(struct rw_semaphore *sem);
9415 +#endif
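[Annotation, not part of the patch] The RT rw_semaphore is rebuilt on an rtmutex plus an atomic reader count biased by READER_BIAS; rwsem_is_locked()/rwsem_is_contended() simply read that counter. The caller-facing API (down_read(), up_read(), down_write(), up_write()) is unchanged, e.g. (semaphore name is illustrative):

        #include <linux/rwsem.h>

        static DECLARE_RWSEM(my_rwsem);          /* illustrative */

        static void reader_side(void)
        {
                down_read(&my_rwsem);
                /* several readers may hold this concurrently */
                up_read(&my_rwsem);
        }

        static void writer_side(void)
        {
                down_write(&my_rwsem);
                /* exclusive section */
                up_write(&my_rwsem);
        }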
9416 diff --git a/include/linux/sched.h b/include/linux/sched.h
9417 index a4d0afc009a7..e775696b480a 100644
9418 --- a/include/linux/sched.h
9419 +++ b/include/linux/sched.h
9420 @@ -26,6 +26,7 @@ struct sched_param {
9421  #include <linux/nodemask.h>
9422  #include <linux/mm_types.h>
9423  #include <linux/preempt.h>
9424 +#include <asm/kmap_types.h>
9426  #include <asm/page.h>
9427  #include <asm/ptrace.h>
9428 @@ -236,17 +237,13 @@ extern char ___assert_task_state[1 - 2*!!(
9430  /* Convenience macros for the sake of wake_up */
9431  #define TASK_NORMAL            (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
9432 -#define TASK_ALL               (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
9434  /* get_task_state() */
9435  #define TASK_REPORT            (TASK_RUNNING | TASK_INTERRUPTIBLE | \
9436                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
9437                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
9439 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
9440  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
9441 -#define task_is_stopped_or_traced(task)        \
9442 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
9443  #define task_contributes_to_load(task) \
9444                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
9445                                  (task->flags & PF_FROZEN) == 0 && \
9446 @@ -312,6 +309,11 @@ extern char ___assert_task_state[1 - 2*!!(
9448  #endif
9450 +#define __set_current_state_no_track(state_value)      \
9451 +       do { current->state = (state_value); } while (0)
9452 +#define set_current_state_no_track(state_value)                \
9453 +       set_mb(current->state, (state_value))
9455  /* Task command name length */
9456  #define TASK_COMM_LEN 16
9458 @@ -1022,9 +1024,31 @@ struct wake_q_head {
9459  #define WAKE_Q(name)                                   \
9460         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
9462 -extern void wake_q_add(struct wake_q_head *head,
9463 -                      struct task_struct *task);
9464 -extern void wake_up_q(struct wake_q_head *head);
9465 +extern void __wake_q_add(struct wake_q_head *head,
9466 +                        struct task_struct *task, bool sleeper);
9467 +static inline void wake_q_add(struct wake_q_head *head,
9468 +                             struct task_struct *task)
9470 +       __wake_q_add(head, task, false);
9473 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
9474 +                                     struct task_struct *task)
9476 +       __wake_q_add(head, task, true);
9479 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
9481 +static inline void wake_up_q(struct wake_q_head *head)
9483 +       __wake_up_q(head, false);
9486 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9488 +       __wake_up_q(head, true);
9491  /*
9492   * sched-domains (multiprocessor balancing) declarations:
9493 @@ -1491,6 +1515,7 @@ struct task_struct {
9494         struct thread_info thread_info;
9495  #endif
9496         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9497 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9498         void *stack;
9499         atomic_t usage;
9500         unsigned int flags;     /* per process flags, defined below */
9501 @@ -1530,6 +1555,13 @@ struct task_struct {
9502  #endif
9504         unsigned int policy;
9505 +#ifdef CONFIG_PREEMPT_RT_FULL
9506 +       int migrate_disable;
9507 +       int migrate_disable_update;
9508 +# ifdef CONFIG_SCHED_DEBUG
9509 +       int migrate_disable_atomic;
9510 +# endif
9511 +#endif
9512         int nr_cpus_allowed;
9513         cpumask_t cpus_allowed;
9515 @@ -1668,6 +1700,9 @@ struct task_struct {
9517         struct task_cputime cputime_expires;
9518         struct list_head cpu_timers[3];
9519 +#ifdef CONFIG_PREEMPT_RT_BASE
9520 +       struct task_struct *posix_timer_list;
9521 +#endif
9523  /* process credentials */
9524         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9525 @@ -1699,10 +1734,15 @@ struct task_struct {
9526  /* signal handlers */
9527         struct signal_struct *signal;
9528         struct sighand_struct *sighand;
9529 +       struct sigqueue *sigqueue_cache;
9531         sigset_t blocked, real_blocked;
9532         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9533         struct sigpending pending;
9534 +#ifdef CONFIG_PREEMPT_RT_FULL
9535 +       /* TODO: move me into ->restart_block ? */
9536 +       struct siginfo forced_info;
9537 +#endif
9539         unsigned long sas_ss_sp;
9540         size_t sas_ss_size;
9541 @@ -1728,11 +1768,14 @@ struct task_struct {
9542         raw_spinlock_t pi_lock;
9544         struct wake_q_node wake_q;
9545 +       struct wake_q_node wake_q_sleeper;
9547  #ifdef CONFIG_RT_MUTEXES
9548         /* PI waiters blocked on a rt_mutex held by this task */
9549         struct rb_root pi_waiters;
9550         struct rb_node *pi_waiters_leftmost;
9551 +       /* Updated under owner's pi_lock and rq lock */
9552 +       struct task_struct      *pi_top_task;
9553         /* Deadlock detection and priority inheritance handling */
9554         struct rt_mutex_waiter *pi_blocked_on;
9555  #endif
9556 @@ -1931,6 +1974,12 @@ struct task_struct {
9557         /* bitmask and counter of trace recursion */
9558         unsigned long trace_recursion;
9559  #endif /* CONFIG_TRACING */
9560 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9561 +       u64 preempt_timestamp_hist;
9562 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9563 +       long timer_offset;
9564 +#endif
9565 +#endif
9566  #ifdef CONFIG_KCOV
9567         /* Coverage collection mode enabled for this task (0 if disabled). */
9568         enum kcov_mode kcov_mode;
9569 @@ -1956,8 +2005,22 @@ struct task_struct {
9570         unsigned int    sequential_io;
9571         unsigned int    sequential_io_avg;
9572  #endif
9573 +#ifdef CONFIG_PREEMPT_RT_BASE
9574 +       struct rcu_head put_rcu;
9575 +       int softirq_nestcnt;
9576 +       unsigned int softirqs_raised;
9577 +#endif
9578 +#ifdef CONFIG_PREEMPT_RT_FULL
9579 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9580 +       int kmap_idx;
9581 +       pte_t kmap_pte[KM_TYPE_NR];
9582 +# endif
9583 +#endif
9584  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9585         unsigned long   task_state_change;
9586 +#endif
9587 +#ifdef CONFIG_PREEMPT_RT_FULL
9588 +       int xmit_recursion;
9589  #endif
9590         int pagefault_disabled;
9591  #ifdef CONFIG_MMU
9592 @@ -1998,14 +2061,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9594  #endif
9596 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9597 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9599 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9601 -       return p->nr_cpus_allowed;
9604  #define TNF_MIGRATED   0x01
9605  #define TNF_NO_GROUP   0x02
9606  #define TNF_SHARED     0x04
9607 @@ -2225,6 +2280,15 @@ extern struct pid *cad_pid;
9608  extern void free_task(struct task_struct *tsk);
9609  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9611 +#ifdef CONFIG_PREEMPT_RT_BASE
9612 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9614 +static inline void put_task_struct(struct task_struct *t)
9616 +       if (atomic_dec_and_test(&t->usage))
9617 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9619 +#else
9620  extern void __put_task_struct(struct task_struct *t);
9622  static inline void put_task_struct(struct task_struct *t)
9623 @@ -2232,6 +2296,7 @@ static inline void put_task_struct(struct task_struct *t)
9624         if (atomic_dec_and_test(&t->usage))
9625                 __put_task_struct(t);
9627 +#endif
9629  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9630  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9631 @@ -2273,6 +2338,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9632  /*
9633   * Per process flags
9634   */
9635 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9636  #define PF_EXITING     0x00000004      /* getting shut down */
9637  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9638  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9639 @@ -2441,6 +2507,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9641  extern int set_cpus_allowed_ptr(struct task_struct *p,
9642                                 const struct cpumask *new_mask);
9643 +int migrate_me(void);
9644 +void tell_sched_cpu_down_begin(int cpu);
9645 +void tell_sched_cpu_down_done(int cpu);
9647  #else
9648  static inline void do_set_cpus_allowed(struct task_struct *p,
9649                                       const struct cpumask *new_mask)
9650 @@ -2453,6 +2523,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9651                 return -EINVAL;
9652         return 0;
9654 +static inline int migrate_me(void) { return 0; }
9655 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9656 +static inline void tell_sched_cpu_down_done(int cpu) { }
9657  #endif
9659  #ifdef CONFIG_NO_HZ_COMMON
9660 @@ -2691,6 +2764,7 @@ extern void xtime_update(unsigned long ticks);
9662  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9663  extern int wake_up_process(struct task_struct *tsk);
9664 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9665  extern void wake_up_new_task(struct task_struct *tsk);
9666  #ifdef CONFIG_SMP
9667   extern void kick_process(struct task_struct *tsk);
9668 @@ -2899,6 +2973,17 @@ static inline void mmdrop(struct mm_struct *mm)
9669                 __mmdrop(mm);
9672 +#ifdef CONFIG_PREEMPT_RT_BASE
9673 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9674 +static inline void mmdrop_delayed(struct mm_struct *mm)
9676 +       if (atomic_dec_and_test(&mm->mm_count))
9677 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9679 +#else
9680 +# define mmdrop_delayed(mm)    mmdrop(mm)
9681 +#endif
9683  static inline void mmdrop_async_fn(struct work_struct *work)
9685         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9686 @@ -3291,6 +3376,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9687         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9690 +#ifdef CONFIG_PREEMPT_LAZY
9691 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9693 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9696 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9698 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9701 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9703 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9706 +static inline int need_resched_lazy(void)
9708 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9711 +static inline int need_resched_now(void)
9713 +       return test_thread_flag(TIF_NEED_RESCHED);
9716 +#else
9717 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9718 +static inline int need_resched_lazy(void) { return 0; }
9720 +static inline int need_resched_now(void)
9722 +       return test_thread_flag(TIF_NEED_RESCHED);
9725 +#endif
9727  static inline int restart_syscall(void)
9729         set_tsk_thread_flag(current, TIF_SIGPENDING);
9730 @@ -3322,6 +3444,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9731         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9734 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9736 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9737 +               return true;
9738 +#ifdef CONFIG_PREEMPT_RT_FULL
9739 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9740 +               return true;
9741 +#endif
9742 +       return false;
9745 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9747 +       bool traced_stopped;
9749 +#ifdef CONFIG_PREEMPT_RT_FULL
9750 +       unsigned long flags;
9752 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9753 +       traced_stopped = __task_is_stopped_or_traced(task);
9754 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9755 +#else
9756 +       traced_stopped = __task_is_stopped_or_traced(task);
9757 +#endif
9758 +       return traced_stopped;
9761 +static inline bool task_is_traced(struct task_struct *task)
9763 +       bool traced = false;
9765 +       if (task->state & __TASK_TRACED)
9766 +               return true;
9767 +#ifdef CONFIG_PREEMPT_RT_FULL
9768 +       /* in case the task is sleeping on tasklist_lock */
9769 +       raw_spin_lock_irq(&task->pi_lock);
9770 +       if (task->state & __TASK_TRACED)
9771 +               traced = true;
9772 +       else if (task->saved_state & __TASK_TRACED)
9773 +               traced = true;
9774 +       raw_spin_unlock_irq(&task->pi_lock);
9775 +#endif
9776 +       return traced;
9779  /*
9780   * cond_resched() and cond_resched_lock(): latency reduction via
9781   * explicit rescheduling in places that are safe. The return
9782 @@ -3347,12 +3514,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9783         __cond_resched_lock(lock);                              \
9784  })
9786 +#ifndef CONFIG_PREEMPT_RT_FULL
9787  extern int __cond_resched_softirq(void);
9789  #define cond_resched_softirq() ({                                      \
9790         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9791         __cond_resched_softirq();                                       \
9792  })
9793 +#else
9794 +# define cond_resched_softirq()                cond_resched()
9795 +#endif
9797  static inline void cond_resched_rcu(void)
9799 @@ -3527,6 +3698,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9801  #endif /* CONFIG_SMP */
9803 +static inline int __migrate_disabled(struct task_struct *p)
9805 +#ifdef CONFIG_PREEMPT_RT_FULL
9806 +       return p->migrate_disable;
9807 +#else
9808 +       return 0;
9809 +#endif
9812 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9813 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9815 +       if (__migrate_disabled(p))
9816 +               return cpumask_of(task_cpu(p));
9818 +       return &p->cpus_allowed;
9821 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9823 +       if (__migrate_disabled(p))
9824 +               return 1;
9825 +       return p->nr_cpus_allowed;
9828  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9829  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9831 diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
9832 index a30b172df6e1..db3e91f2bc03 100644
9833 --- a/include/linux/sched/rt.h
9834 +++ b/include/linux/sched/rt.h
9835 @@ -16,27 +16,20 @@ static inline int rt_task(struct task_struct *p)
9838  #ifdef CONFIG_RT_MUTEXES
9839 -extern int rt_mutex_getprio(struct task_struct *p);
9840 -extern void rt_mutex_setprio(struct task_struct *p, int prio);
9841 -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
9842 -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
9844 + * Must hold either p->pi_lock or task_rq(p)->lock.
9845 + */
9846 +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
9848 +       return p->pi_top_task;
9850 +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
9851  extern void rt_mutex_adjust_pi(struct task_struct *p);
9852  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
9854         return tsk->pi_blocked_on != NULL;
9856  #else
9857 -static inline int rt_mutex_getprio(struct task_struct *p)
9859 -       return p->normal_prio;
9862 -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
9863 -                                             int newprio)
9865 -       return newprio;
9868  static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
9870         return NULL;
9871 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9872 index ead97654c4e9..3d7223ffdd3b 100644
9873 --- a/include/linux/seqlock.h
9874 +++ b/include/linux/seqlock.h
9875 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9876         return __read_seqcount_retry(s, start);
9881 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9882 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9884         s->sequence++;
9885         smp_wmb();
9888 -static inline void raw_write_seqcount_end(seqcount_t *s)
9889 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9891 +       preempt_disable_rt();
9892 +       __raw_write_seqcount_begin(s);
9895 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9897         smp_wmb();
9898         s->sequence++;
9901 +static inline void raw_write_seqcount_end(seqcount_t *s)
9903 +       __raw_write_seqcount_end(s);
9904 +       preempt_enable_rt();
9907  /**
9908   * raw_write_seqcount_barrier - do a seq write barrier
9909   * @s: pointer to seqcount_t
9910 @@ -428,10 +438,32 @@ typedef struct {
9911  /*
9912   * Read side functions for starting and finalizing a read side section.
9913   */
9914 +#ifndef CONFIG_PREEMPT_RT_FULL
9915  static inline unsigned read_seqbegin(const seqlock_t *sl)
9917         return read_seqcount_begin(&sl->seqcount);
9919 +#else
9921 + * Starvation safe read side for RT
9922 + */
9923 +static inline unsigned read_seqbegin(seqlock_t *sl)
9925 +       unsigned ret;
9927 +repeat:
9928 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9929 +       if (unlikely(ret & 1)) {
9930 +               /*
9931 +                * Take the lock and let the writer proceed (i.e. possibly
9932 +                * boost it), otherwise we could loop here forever.
9933 +                */
9934 +               spin_unlock_wait(&sl->lock);
9935 +               goto repeat;
9936 +       }
9937 +       return ret;
9939 +#endif
9941  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9943 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9944  static inline void write_seqlock(seqlock_t *sl)
9946         spin_lock(&sl->lock);
9947 -       write_seqcount_begin(&sl->seqcount);
9948 +       __raw_write_seqcount_begin(&sl->seqcount);
9951 +static inline int try_write_seqlock(seqlock_t *sl)
9953 +       if (spin_trylock(&sl->lock)) {
9954 +               __raw_write_seqcount_begin(&sl->seqcount);
9955 +               return 1;
9956 +       }
9957 +       return 0;
9960  static inline void write_sequnlock(seqlock_t *sl)
9962 -       write_seqcount_end(&sl->seqcount);
9963 +       __raw_write_seqcount_end(&sl->seqcount);
9964         spin_unlock(&sl->lock);
9967  static inline void write_seqlock_bh(seqlock_t *sl)
9969         spin_lock_bh(&sl->lock);
9970 -       write_seqcount_begin(&sl->seqcount);
9971 +       __raw_write_seqcount_begin(&sl->seqcount);
9974  static inline void write_sequnlock_bh(seqlock_t *sl)
9976 -       write_seqcount_end(&sl->seqcount);
9977 +       __raw_write_seqcount_end(&sl->seqcount);
9978         spin_unlock_bh(&sl->lock);
9981  static inline void write_seqlock_irq(seqlock_t *sl)
9983         spin_lock_irq(&sl->lock);
9984 -       write_seqcount_begin(&sl->seqcount);
9985 +       __raw_write_seqcount_begin(&sl->seqcount);
9988  static inline void write_sequnlock_irq(seqlock_t *sl)
9990 -       write_seqcount_end(&sl->seqcount);
9991 +       __raw_write_seqcount_end(&sl->seqcount);
9992         spin_unlock_irq(&sl->lock);
9995 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9996         unsigned long flags;
9998         spin_lock_irqsave(&sl->lock, flags);
9999 -       write_seqcount_begin(&sl->seqcount);
10000 +       __raw_write_seqcount_begin(&sl->seqcount);
10001         return flags;
10004 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
10005  static inline void
10006  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
10008 -       write_seqcount_end(&sl->seqcount);
10009 +       __raw_write_seqcount_end(&sl->seqcount);
10010         spin_unlock_irqrestore(&sl->lock, flags);
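The hunk above splits the seqcount write side into __raw_write_seqcount_begin()/__raw_write_seqcount_end() (plain sequence bump) and raw_write_seqcount_begin()/raw_write_seqcount_end() (which additionally disable/enable preemption on RT), and gives read_seqbegin() a starvation-safe RT variant that blocks on sl->lock instead of spinning while the sequence is odd. Callers keep using the standard seqlock API unchanged; the following is a minimal sketch of that pattern, assuming a kernel with this patch applied (struct sample_stats and the function names are invented for illustration):

    #include <linux/seqlock.h>

    /* Hypothetical counters protected by a seqlock. */
    struct sample_stats {
        seqlock_t lock;     /* seqlock_init(&s->lock) at setup time */
        u64 packets;
        u64 bytes;
    };

    /* Writer: write_seqlock() takes sl->lock and bumps the sequence via
     * __raw_write_seqcount_begin(), exactly as mapped in the hunk above. */
    static void stats_update(struct sample_stats *s, unsigned int len)
    {
        write_seqlock(&s->lock);
        s->packets++;
        s->bytes += len;
        write_sequnlock(&s->lock);
    }

    /* Reader: on !RT this retries while a writer is active; on RT the patched
     * read_seqbegin() waits on sl->lock, so the writer can be priority-boosted
     * instead of being starved by a spinning reader. */
    static void stats_read(struct sample_stats *s, u64 *packets, u64 *bytes)
    {
        unsigned int seq;

        do {
            seq = read_seqbegin(&s->lock);
            *packets = s->packets;
            *bytes   = s->bytes;
        } while (read_seqretry(&s->lock, seq));
    }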
10013 diff --git a/include/linux/signal.h b/include/linux/signal.h
10014 index b63f63eaa39c..295540fdfc72 100644
10015 --- a/include/linux/signal.h
10016 +++ b/include/linux/signal.h
10017 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
10020  extern void flush_sigqueue(struct sigpending *queue);
10021 +extern void flush_task_sigqueue(struct task_struct *tsk);
10023  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
10024  static inline int valid_signal(unsigned long sig)
10025 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
10026 index 601dfa849d30..dca387a8fa6b 100644
10027 --- a/include/linux/skbuff.h
10028 +++ b/include/linux/skbuff.h
10029 @@ -284,6 +284,7 @@ struct sk_buff_head {
10031         __u32           qlen;
10032         spinlock_t      lock;
10033 +       raw_spinlock_t  raw_lock;
10034  };
10036  struct sk_buff;
10037 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
10038         __skb_queue_head_init(list);
10041 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
10043 +       raw_spin_lock_init(&list->raw_lock);
10044 +       __skb_queue_head_init(list);
10047  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
10048                 struct lock_class_key *class)
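skb_queue_head_init_raw() initializes the new raw_lock member so that queues touched from contexts that may never sleep, even on RT, can be protected by a raw spinlock together with the lockless __skb_* helpers. A hedged sketch of that usage, assuming the patched skbuff.h; the queue and function names below are invented for illustration:

    #include <linux/skbuff.h>

    static struct sk_buff_head atomic_q;

    static void atomic_q_setup(void)
    {
        /* Sets up ->raw_lock (and the list head) instead of ->lock. */
        skb_queue_head_init_raw(&atomic_q);
    }

    /* Enqueue from a context that must stay atomic even on PREEMPT_RT. */
    static void atomic_q_add(struct sk_buff *skb)
    {
        unsigned long flags;

        raw_spin_lock_irqsave(&atomic_q.raw_lock, flags);
        __skb_queue_tail(&atomic_q, skb);   /* lock-free list variant */
        raw_spin_unlock_irqrestore(&atomic_q.raw_lock, flags);
    }

    static struct sk_buff *atomic_q_get(void)
    {
        unsigned long flags;
        struct sk_buff *skb;

        raw_spin_lock_irqsave(&atomic_q.raw_lock, flags);
        skb = __skb_dequeue(&atomic_q);
        raw_spin_unlock_irqrestore(&atomic_q.raw_lock, flags);
        return skb;
    }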
10050 diff --git a/include/linux/smp.h b/include/linux/smp.h
10051 index 8e0cb7a0f836..891c533724f5 100644
10052 --- a/include/linux/smp.h
10053 +++ b/include/linux/smp.h
10054 @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
10055  extern void __init setup_nr_cpu_ids(void);
10056  extern void __init smp_init(void);
10058 +extern int __boot_cpu_id;
10060 +static inline int get_boot_cpu_id(void)
10062 +       return __boot_cpu_id;
10065  #else /* !SMP */
10067  static inline void smp_send_stop(void) { }
10068 @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
10069  static inline void smp_init(void) { }
10070  #endif
10072 +static inline int get_boot_cpu_id(void)
10074 +       return 0;
10077  #endif /* !SMP */
10079  /*
10080 @@ -185,6 +197,9 @@ static inline void smp_init(void) { }
10081  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
10082  #define put_cpu()              preempt_enable()
10084 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
10085 +#define put_cpu_light()                migrate_enable()
10087  /*
10088   * Callback to arch code if there's nosmp or maxcpus=0 on the
10089   * boot command line:
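get_cpu_light()/put_cpu_light() mirror get_cpu()/put_cpu() but use migrate_disable()/migrate_enable() instead of disabling preemption, so the caller is guaranteed to stay on its CPU while remaining preemptible (and able to take sleeping RT spinlocks). A small illustrative sketch, assuming the patched smp.h; the per-CPU counter is made up for the example:

    #include <linux/smp.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, sample_hits);

    static void count_hit(void)
    {
        int cpu;

        /* Pin to this CPU without disabling preemption (RT friendly). */
        cpu = get_cpu_light();
        per_cpu(sample_hits, cpu)++;
        put_cpu_light();
    }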
10090 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
10091 index 47dd0cebd204..b241cc044bd3 100644
10092 --- a/include/linux/spinlock.h
10093 +++ b/include/linux/spinlock.h
10094 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10095  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
10097  /* Include rwlock functions */
10098 -#include <linux/rwlock.h>
10099 +#ifdef CONFIG_PREEMPT_RT_FULL
10100 +# include <linux/rwlock_rt.h>
10101 +#else
10102 +# include <linux/rwlock.h>
10103 +#endif
10105  /*
10106   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
10107 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10108  # include <linux/spinlock_api_up.h>
10109  #endif
10111 +#ifdef CONFIG_PREEMPT_RT_FULL
10112 +# include <linux/spinlock_rt.h>
10113 +#else /* PREEMPT_RT_FULL */
10115  /*
10116   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
10117   */
10118 @@ -416,4 +424,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
10119  #define atomic_dec_and_lock(atomic, lock) \
10120                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
10122 +#endif /* !PREEMPT_RT_FULL */
10124  #endif /* __LINUX_SPINLOCK_H */
10125 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
10126 index 5344268e6e62..043263f30e81 100644
10127 --- a/include/linux/spinlock_api_smp.h
10128 +++ b/include/linux/spinlock_api_smp.h
10129 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
10130         return 0;
10133 -#include <linux/rwlock_api_smp.h>
10134 +#ifndef CONFIG_PREEMPT_RT_FULL
10135 +# include <linux/rwlock_api_smp.h>
10136 +#endif
10138  #endif /* __LINUX_SPINLOCK_API_SMP_H */
10139 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
10140 new file mode 100644
10141 index 000000000000..43ca841b913a
10142 --- /dev/null
10143 +++ b/include/linux/spinlock_rt.h
10144 @@ -0,0 +1,162 @@
10145 +#ifndef __LINUX_SPINLOCK_RT_H
10146 +#define __LINUX_SPINLOCK_RT_H
10148 +#ifndef __LINUX_SPINLOCK_H
10149 +#error Do not include directly. Use spinlock.h
10150 +#endif
10152 +#include <linux/bug.h>
10154 +extern void
10155 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
10157 +#define spin_lock_init(slock)                          \
10158 +do {                                                   \
10159 +       static struct lock_class_key __key;             \
10160 +                                                       \
10161 +       rt_mutex_init(&(slock)->lock);                  \
10162 +       __rt_spin_lock_init(slock, #slock, &__key);     \
10163 +} while (0)
10165 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
10166 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
10167 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
10169 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
10170 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
10171 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
10172 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
10173 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
10174 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
10175 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
10176 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
10177 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
10180 + * lockdep-less calls, for derived types like rwlock:
10181 + * (for trylock they can use rt_mutex_trylock() directly).
10182 + */
10183 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
10184 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
10185 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
10187 +#define spin_lock(lock)                        rt_spin_lock(lock)
10189 +#define spin_lock_bh(lock)                     \
10190 +       do {                                    \
10191 +               local_bh_disable();             \
10192 +               rt_spin_lock(lock);             \
10193 +       } while (0)
10195 +#define spin_lock_irq(lock)            spin_lock(lock)
10197 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
10199 +#define spin_trylock(lock)                     \
10200 +({                                             \
10201 +       int __locked;                           \
10202 +       __locked = spin_do_trylock(lock);       \
10203 +       __locked;                               \
10206 +#ifdef CONFIG_LOCKDEP
10207 +# define spin_lock_nested(lock, subclass)              \
10208 +       do {                                            \
10209 +               rt_spin_lock_nested(lock, subclass);    \
10210 +       } while (0)
10212 +#define spin_lock_bh_nested(lock, subclass)            \
10213 +       do {                                            \
10214 +               local_bh_disable();                     \
10215 +               rt_spin_lock_nested(lock, subclass);    \
10216 +       } while (0)
10218 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10219 +       do {                                             \
10220 +               typecheck(unsigned long, flags);         \
10221 +               flags = 0;                               \
10222 +               rt_spin_lock_nested(lock, subclass);     \
10223 +       } while (0)
10224 +#else
10225 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
10226 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
10228 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10229 +       do {                                             \
10230 +               typecheck(unsigned long, flags);         \
10231 +               flags = 0;                               \
10232 +               spin_lock(lock);                         \
10233 +       } while (0)
10234 +#endif
10236 +#define spin_lock_irqsave(lock, flags)                  \
10237 +       do {                                             \
10238 +               typecheck(unsigned long, flags);         \
10239 +               flags = 0;                               \
10240 +               spin_lock(lock);                         \
10241 +       } while (0)
10243 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
10245 +       unsigned long flags = 0;
10246 +#ifdef CONFIG_TRACE_IRQFLAGS
10247 +       flags = rt_spin_lock_trace_flags(lock);
10248 +#else
10249 +       spin_lock(lock); /* lock_local */
10250 +#endif
10251 +       return flags;
10254 +/* FIXME: we need rt_spin_lock_nest_lock */
10255 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
10257 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
10259 +#define spin_unlock_bh(lock)                           \
10260 +       do {                                            \
10261 +               rt_spin_unlock(lock);                   \
10262 +               local_bh_enable();                      \
10263 +       } while (0)
10265 +#define spin_unlock_irq(lock)          spin_unlock(lock)
10267 +#define spin_unlock_irqrestore(lock, flags)            \
10268 +       do {                                            \
10269 +               typecheck(unsigned long, flags);        \
10270 +               (void) flags;                           \
10271 +               spin_unlock(lock);                      \
10272 +       } while (0)
10274 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
10275 +#define spin_trylock_irq(lock) spin_trylock(lock)
10277 +#define spin_trylock_irqsave(lock, flags)      \
10278 +       rt_spin_trylock_irqsave(lock, &(flags))
10280 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
10282 +#ifdef CONFIG_GENERIC_LOCKBREAK
10283 +# define spin_is_contended(lock)       ((lock)->break_lock)
10284 +#else
10285 +# define spin_is_contended(lock)       (((void)(lock), 0))
10286 +#endif
10288 +static inline int spin_can_lock(spinlock_t *lock)
10290 +       return !rt_mutex_is_locked(&lock->lock);
10293 +static inline int spin_is_locked(spinlock_t *lock)
10295 +       return rt_mutex_is_locked(&lock->lock);
10298 +static inline void assert_spin_locked(spinlock_t *lock)
10300 +       BUG_ON(!spin_is_locked(lock));
10303 +#define atomic_dec_and_lock(atomic, lock) \
10304 +       atomic_dec_and_spin_lock(atomic, lock)
10306 +#endif
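With spinlock_rt.h pulled in through spinlock.h under CONFIG_PREEMPT_RT_FULL, existing spin_lock()/spin_unlock() call sites compile unchanged but end up in rt_spin_lock()/rt_spin_unlock(), i.e. a priority-inheriting rt_mutex; note that spin_lock_irqsave() then only typechecks flags and no longer disables interrupts. A hedged sketch of an ordinary driver-style critical section that works under either mapping (the lock and variable names are invented):

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(dev_lock);
    static unsigned int dev_pending;

    static void dev_queue_request(unsigned int req)
    {
        unsigned long flags;

        /*
         * !RT: raw spinning lock with local interrupts disabled.
         * RT:  sleeping rt_mutex; flags is only typechecked, IRQs stay on.
         */
        spin_lock_irqsave(&dev_lock, flags);
        dev_pending |= req;
        spin_unlock_irqrestore(&dev_lock, flags);
    }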
10307 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
10308 index 73548eb13a5d..10bac715ea96 100644
10309 --- a/include/linux/spinlock_types.h
10310 +++ b/include/linux/spinlock_types.h
10311 @@ -9,80 +9,15 @@
10312   * Released under the General Public License (GPL).
10313   */
10315 -#if defined(CONFIG_SMP)
10316 -# include <asm/spinlock_types.h>
10317 -#else
10318 -# include <linux/spinlock_types_up.h>
10319 -#endif
10321 -#include <linux/lockdep.h>
10323 -typedef struct raw_spinlock {
10324 -       arch_spinlock_t raw_lock;
10325 -#ifdef CONFIG_GENERIC_LOCKBREAK
10326 -       unsigned int break_lock;
10327 -#endif
10328 -#ifdef CONFIG_DEBUG_SPINLOCK
10329 -       unsigned int magic, owner_cpu;
10330 -       void *owner;
10331 -#endif
10332 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10333 -       struct lockdep_map dep_map;
10334 -#endif
10335 -} raw_spinlock_t;
10337 -#define SPINLOCK_MAGIC         0xdead4ead
10339 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10341 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10342 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10343 -#else
10344 -# define SPIN_DEP_MAP_INIT(lockname)
10345 -#endif
10346 +#include <linux/spinlock_types_raw.h>
10348 -#ifdef CONFIG_DEBUG_SPINLOCK
10349 -# define SPIN_DEBUG_INIT(lockname)             \
10350 -       .magic = SPINLOCK_MAGIC,                \
10351 -       .owner_cpu = -1,                        \
10352 -       .owner = SPINLOCK_OWNER_INIT,
10353 +#ifndef CONFIG_PREEMPT_RT_FULL
10354 +# include <linux/spinlock_types_nort.h>
10355 +# include <linux/rwlock_types.h>
10356  #else
10357 -# define SPIN_DEBUG_INIT(lockname)
10358 +# include <linux/rtmutex.h>
10359 +# include <linux/spinlock_types_rt.h>
10360 +# include <linux/rwlock_types_rt.h>
10361  #endif
10363 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10364 -       {                                       \
10365 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10366 -       SPIN_DEBUG_INIT(lockname)               \
10367 -       SPIN_DEP_MAP_INIT(lockname) }
10369 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10370 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10372 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10374 -typedef struct spinlock {
10375 -       union {
10376 -               struct raw_spinlock rlock;
10378 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10379 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10380 -               struct {
10381 -                       u8 __padding[LOCK_PADSIZE];
10382 -                       struct lockdep_map dep_map;
10383 -               };
10384 -#endif
10385 -       };
10386 -} spinlock_t;
10388 -#define __SPIN_LOCK_INITIALIZER(lockname) \
10389 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10391 -#define __SPIN_LOCK_UNLOCKED(lockname) \
10392 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10394 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10396 -#include <linux/rwlock_types.h>
10398  #endif /* __LINUX_SPINLOCK_TYPES_H */
10399 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
10400 new file mode 100644
10401 index 000000000000..f1dac1fb1d6a
10402 --- /dev/null
10403 +++ b/include/linux/spinlock_types_nort.h
10404 @@ -0,0 +1,33 @@
10405 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
10406 +#define __LINUX_SPINLOCK_TYPES_NORT_H
10408 +#ifndef __LINUX_SPINLOCK_TYPES_H
10409 +#error "Do not include directly. Include spinlock_types.h instead"
10410 +#endif
10413 + * The non RT version maps spinlocks to raw_spinlocks
10414 + */
10415 +typedef struct spinlock {
10416 +       union {
10417 +               struct raw_spinlock rlock;
10419 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10420 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10421 +               struct {
10422 +                       u8 __padding[LOCK_PADSIZE];
10423 +                       struct lockdep_map dep_map;
10424 +               };
10425 +#endif
10426 +       };
10427 +} spinlock_t;
10429 +#define __SPIN_LOCK_INITIALIZER(lockname) \
10430 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10432 +#define __SPIN_LOCK_UNLOCKED(lockname) \
10433 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10435 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10437 +#endif
10438 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
10439 new file mode 100644
10440 index 000000000000..edffc4d53fc9
10441 --- /dev/null
10442 +++ b/include/linux/spinlock_types_raw.h
10443 @@ -0,0 +1,56 @@
10444 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
10445 +#define __LINUX_SPINLOCK_TYPES_RAW_H
10447 +#if defined(CONFIG_SMP)
10448 +# include <asm/spinlock_types.h>
10449 +#else
10450 +# include <linux/spinlock_types_up.h>
10451 +#endif
10453 +#include <linux/lockdep.h>
10455 +typedef struct raw_spinlock {
10456 +       arch_spinlock_t raw_lock;
10457 +#ifdef CONFIG_GENERIC_LOCKBREAK
10458 +       unsigned int break_lock;
10459 +#endif
10460 +#ifdef CONFIG_DEBUG_SPINLOCK
10461 +       unsigned int magic, owner_cpu;
10462 +       void *owner;
10463 +#endif
10464 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10465 +       struct lockdep_map dep_map;
10466 +#endif
10467 +} raw_spinlock_t;
10469 +#define SPINLOCK_MAGIC         0xdead4ead
10471 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10473 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10474 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10475 +#else
10476 +# define SPIN_DEP_MAP_INIT(lockname)
10477 +#endif
10479 +#ifdef CONFIG_DEBUG_SPINLOCK
10480 +# define SPIN_DEBUG_INIT(lockname)             \
10481 +       .magic = SPINLOCK_MAGIC,                \
10482 +       .owner_cpu = -1,                        \
10483 +       .owner = SPINLOCK_OWNER_INIT,
10484 +#else
10485 +# define SPIN_DEBUG_INIT(lockname)
10486 +#endif
10488 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10489 +       {                                       \
10490 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10491 +       SPIN_DEBUG_INIT(lockname)               \
10492 +       SPIN_DEP_MAP_INIT(lockname) }
10494 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10495 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10497 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10499 +#endif
10500 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
10501 new file mode 100644
10502 index 000000000000..3e3d8c5f7a9a
10503 --- /dev/null
10504 +++ b/include/linux/spinlock_types_rt.h
10505 @@ -0,0 +1,48 @@
10506 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
10507 +#define __LINUX_SPINLOCK_TYPES_RT_H
10509 +#ifndef __LINUX_SPINLOCK_TYPES_H
10510 +#error "Do not include directly. Include spinlock_types.h instead"
10511 +#endif
10513 +#include <linux/cache.h>
10516 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
10517 + */
10518 +typedef struct spinlock {
10519 +       struct rt_mutex         lock;
10520 +       unsigned int            break_lock;
10521 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10522 +       struct lockdep_map      dep_map;
10523 +#endif
10524 +} spinlock_t;
10526 +#ifdef CONFIG_DEBUG_RT_MUTEXES
10527 +# define __RT_SPIN_INITIALIZER(name) \
10528 +       { \
10529 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
10530 +       .save_state = 1, \
10531 +       .file = __FILE__, \
10532 +       .line = __LINE__ , \
10533 +       }
10534 +#else
10535 +# define __RT_SPIN_INITIALIZER(name) \
10536 +       {                                                               \
10537 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
10538 +       .save_state = 1, \
10539 +       }
10540 +#endif
10543 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
10546 +#define __SPIN_LOCK_UNLOCKED(name)                     \
10547 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10548 +         SPIN_DEP_MAP_INIT(name) }
10550 +#define DEFINE_SPINLOCK(name) \
10551 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10553 +#endif
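After this split, spinlock_t wraps an rt_mutex on RT while raw_spinlock_t (from spinlock_types_raw.h) stays an arch spinlock in every configuration, so code that genuinely must spin with preemption off (interrupt entry, scheduler internals, and similar low-level paths) has to pick the raw type explicitly. A short sketch contrasting the two, with invented lock names:

    #include <linux/spinlock.h>

    /* Sleeps on contention when PREEMPT_RT_FULL is enabled. */
    static DEFINE_SPINLOCK(normal_lock);

    /* Always an arch spinlock; never sleeps, so keep the section tiny. */
    static DEFINE_RAW_SPINLOCK(lowlevel_lock);

    static void update_normal_state(unsigned int *state, unsigned int val)
    {
        spin_lock(&normal_lock);        /* may sleep on RT */
        *state = val;
        spin_unlock(&normal_lock);
    }

    static void update_lowlevel_state(unsigned int *state, unsigned int val)
    {
        unsigned long flags;

        raw_spin_lock_irqsave(&lowlevel_lock, flags);
        *state = val;                   /* a few instructions at most */
        raw_spin_unlock_irqrestore(&lowlevel_lock, flags);
    }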
10554 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10555 index dc8eb63c6568..e793d3a257da 100644
10556 --- a/include/linux/srcu.h
10557 +++ b/include/linux/srcu.h
10558 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10560  void process_srcu(struct work_struct *work);
10562 -#define __SRCU_STRUCT_INIT(name)                                       \
10563 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10564         {                                                               \
10565                 .completed = -300,                                      \
10566 -               .per_cpu_ref = &name##_srcu_array,                      \
10567 +               .per_cpu_ref = &pcpu_name,                              \
10568                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10569                 .running = false,                                       \
10570                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10571 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10572   */
10573  #define __DEFINE_SRCU(name, is_static)                                 \
10574         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10575 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10576 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10577  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10578  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
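The __SRCU_STRUCT_INIT() change only threads the per-CPU array name through DEFINE_SRCU(); the SRCU API seen by users is unchanged. For context, a minimal reader/updater sketch using that unchanged API (my_srcu and the function names are illustrative):

    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(my_srcu);

    static void reader(void)
    {
        int idx;

        idx = srcu_read_lock(&my_srcu);
        /* ... dereference SRCU-protected data ... */
        srcu_read_unlock(&my_srcu, idx);
    }

    static void updater(void)
    {
        /* ... unpublish the old data ... */
        synchronize_srcu(&my_srcu);     /* wait for pre-existing readers */
        /* ... free the old data ... */
    }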
10580 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10581 index d9718378a8be..e81e6dc7dcb1 100644
10582 --- a/include/linux/suspend.h
10583 +++ b/include/linux/suspend.h
10584 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10585         void (*end)(void);
10586  };
10588 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10589 +extern bool pm_in_action;
10590 +#else
10591 +# define pm_in_action false
10592 +#endif
10594  #ifdef CONFIG_SUSPEND
10595  /**
10596   * suspend_set_ops - set platform dependent suspend operations
10597 diff --git a/include/linux/swait.h b/include/linux/swait.h
10598 index c1f9c62a8a50..83f004a72320 100644
10599 --- a/include/linux/swait.h
10600 +++ b/include/linux/swait.h
10601 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10602  extern void swake_up(struct swait_queue_head *q);
10603  extern void swake_up_all(struct swait_queue_head *q);
10604  extern void swake_up_locked(struct swait_queue_head *q);
10605 +extern void swake_up_all_locked(struct swait_queue_head *q);
10607  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10608  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10609 diff --git a/include/linux/swap.h b/include/linux/swap.h
10610 index 55ff5593c193..52bf5477dc92 100644
10611 --- a/include/linux/swap.h
10612 +++ b/include/linux/swap.h
10613 @@ -11,6 +11,7 @@
10614  #include <linux/fs.h>
10615  #include <linux/atomic.h>
10616  #include <linux/page-flags.h>
10617 +#include <linux/locallock.h>
10618  #include <asm/page.h>
10620  struct notifier_block;
10621 @@ -247,7 +248,8 @@ struct swap_info_struct {
10622  void *workingset_eviction(struct address_space *mapping, struct page *page);
10623  bool workingset_refault(void *shadow);
10624  void workingset_activation(struct page *page);
10625 -extern struct list_lru workingset_shadow_nodes;
10626 +extern struct list_lru __workingset_shadow_nodes;
10627 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10629  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10631 @@ -292,6 +294,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10634  /* linux/mm/swap.c */
10635 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10636  extern void lru_cache_add(struct page *);
10637  extern void lru_cache_add_anon(struct page *page);
10638  extern void lru_cache_add_file(struct page *page);
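swap.h now declares local IRQ locks (swapvec_lock, workingset_shadow_lock): the RT locallock facility replaces plain local_irq_save()-style per-CPU protection with a per-CPU lock that becomes a sleeping lock on RT. The header itself is added elsewhere in this patch; assuming its usual API (DEFINE_LOCAL_IRQ_LOCK, local_lock_irqsave, local_unlock_irqrestore), a hedged sketch of the calling pattern with invented names:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_LOCAL_IRQ_LOCK(my_pcp_lock);
    static DEFINE_PER_CPU(unsigned long, my_pcp_data);

    static void touch_pcp_data(void)
    {
        unsigned long flags;

        /* !RT: local_irq_save(); RT: per-CPU sleeping lock + migrate_disable(). */
        local_lock_irqsave(my_pcp_lock, flags);
        __this_cpu_inc(my_pcp_data);
        local_unlock_irqrestore(my_pcp_lock, flags);
    }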
10639 diff --git a/include/linux/swork.h b/include/linux/swork.h
10640 new file mode 100644
10641 index 000000000000..f175fa9a6016
10642 --- /dev/null
10643 +++ b/include/linux/swork.h
10644 @@ -0,0 +1,24 @@
10645 +#ifndef _LINUX_SWORK_H
10646 +#define _LINUX_SWORK_H
10648 +#include <linux/list.h>
10650 +struct swork_event {
10651 +       struct list_head item;
10652 +       unsigned long flags;
10653 +       void (*func)(struct swork_event *);
10656 +static inline void INIT_SWORK(struct swork_event *event,
10657 +                             void (*func)(struct swork_event *))
10659 +       event->flags = 0;
10660 +       event->func = func;
10663 +bool swork_queue(struct swork_event *sev);
10665 +int swork_get(void);
10666 +void swork_put(void);
10668 +#endif /* _LINUX_SWORK_H */
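swork ("simple work") is a small deferred-execution facility this patch uses where ordinary workqueues are problematic on RT (see the cgroup.c conversion further down, which replaces INIT_WORK/queue_work with INIT_SWORK/swork_queue). Based only on the declarations above, a hedged usage sketch with invented names and placeholder bodies:

    #include <linux/swork.h>

    static struct swork_event cleanup_sev;

    static void cleanup_fn(struct swork_event *sev)
    {
        /* Runs later in the swork worker thread, fully preemptible. */
    }

    static int example_init(void)
    {
        int ret;

        ret = swork_get();              /* take a reference on the worker */
        if (ret)
            return ret;

        INIT_SWORK(&cleanup_sev, cleanup_fn);
        return 0;
    }

    static void example_trigger(void)
    {
        swork_queue(&cleanup_sev);      /* hand the event to the worker */
    }

    static void example_exit(void)
    {
        swork_put();                    /* drop the worker reference */
    }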
10669 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10670 index 2873baf5372a..eb1a108f17ca 100644
10671 --- a/include/linux/thread_info.h
10672 +++ b/include/linux/thread_info.h
10673 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10674  #define test_thread_flag(flag) \
10675         test_ti_thread_flag(current_thread_info(), flag)
10677 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10678 +#ifdef CONFIG_PREEMPT_LAZY
10679 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10680 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10681 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10682 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
10684 +#else
10685 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10686 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10687 +#define tif_need_resched_lazy()        0
10688 +#endif
10690  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10691  static inline int arch_within_stack_frames(const void * const stack,
10692 diff --git a/include/linux/timer.h b/include/linux/timer.h
10693 index ec86e4e55ea3..8e5b680d1275 100644
10694 --- a/include/linux/timer.h
10695 +++ b/include/linux/timer.h
10696 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10698  extern int try_to_del_timer_sync(struct timer_list *timer);
10700 -#ifdef CONFIG_SMP
10701 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10702    extern int del_timer_sync(struct timer_list *timer);
10703  #else
10704  # define del_timer_sync(t)             del_timer(t)
10705 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10706 index ba57266d9e80..5c36934ec2bc 100644
10707 --- a/include/linux/trace_events.h
10708 +++ b/include/linux/trace_events.h
10709 @@ -56,6 +56,9 @@ struct trace_entry {
10710         unsigned char           flags;
10711         unsigned char           preempt_count;
10712         int                     pid;
10713 +       unsigned short          migrate_disable;
10714 +       unsigned short          padding;
10715 +       unsigned char           preempt_lazy_count;
10716  };
10718  #define TRACE_EVENT_TYPE_MAX                                           \
10719 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10720 index f30c187ed785..83bf0f798426 100644
10721 --- a/include/linux/uaccess.h
10722 +++ b/include/linux/uaccess.h
10723 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10724   */
10725  static inline void pagefault_disable(void)
10727 +       migrate_disable();
10728         pagefault_disabled_inc();
10729         /*
10730          * make sure to have issued the store before a pagefault
10731 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10732          */
10733         barrier();
10734         pagefault_disabled_dec();
10735 +       migrate_enable();
10738  /*
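pagefault_disable()/pagefault_enable() now also disable/enable migration, since on RT the pagefault-disabled region may acquire sleeping locks and must at least stay on one CPU. The calling convention is unchanged; a hedged sketch of the usual pattern around a non-faulting user access (the function name and error handling are illustrative):

    #include <linux/uaccess.h>
    #include <linux/errno.h>

    /* Peek at user memory from a context that must not take a page fault. */
    static int peek_user_word(const void __user *uaddr, unsigned long *val)
    {
        int ret;

        pagefault_disable();            /* now also migrate_disable() */
        ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
        pagefault_enable();             /* now also migrate_enable() */

        return ret ? -EFAULT : 0;
    }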
10739 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10740 index 4a29c75b146e..0a294e950df8 100644
10741 --- a/include/linux/uprobes.h
10742 +++ b/include/linux/uprobes.h
10743 @@ -27,6 +27,7 @@
10744  #include <linux/errno.h>
10745  #include <linux/rbtree.h>
10746  #include <linux/types.h>
10747 +#include <linux/wait.h>
10749  struct vm_area_struct;
10750  struct mm_struct;
10751 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10752 index 613771909b6e..e28c5a43229d 100644
10753 --- a/include/linux/vmstat.h
10754 +++ b/include/linux/vmstat.h
10755 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10756   */
10757  static inline void __count_vm_event(enum vm_event_item item)
10759 +       preempt_disable_rt();
10760         raw_cpu_inc(vm_event_states.event[item]);
10761 +       preempt_enable_rt();
10764  static inline void count_vm_event(enum vm_event_item item)
10765 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10767  static inline void __count_vm_events(enum vm_event_item item, long delta)
10769 +       preempt_disable_rt();
10770         raw_cpu_add(vm_event_states.event[item], delta);
10771 +       preempt_enable_rt();
10774  static inline void count_vm_events(enum vm_event_item item, long delta)
10775 diff --git a/include/linux/wait.h b/include/linux/wait.h
10776 index 2408e8d5c05c..db50d6609195 100644
10777 --- a/include/linux/wait.h
10778 +++ b/include/linux/wait.h
10779 @@ -8,6 +8,7 @@
10780  #include <linux/spinlock.h>
10781  #include <asm/current.h>
10782  #include <uapi/linux/wait.h>
10783 +#include <linux/atomic.h>
10785  typedef struct __wait_queue wait_queue_t;
10786  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10787 diff --git a/include/net/dst.h b/include/net/dst.h
10788 index ddcff17615da..a1fc787b1a8c 100644
10789 --- a/include/net/dst.h
10790 +++ b/include/net/dst.h
10791 @@ -452,7 +452,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10792  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10793                                    struct sk_buff *skb)
10795 -       const struct hh_cache *hh;
10796 +       struct hh_cache *hh;
10798         if (dst->pending_confirm) {
10799                 unsigned long now = jiffies;
10800 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10801 index 231e121cc7d9..d125222b979d 100644
10802 --- a/include/net/gen_stats.h
10803 +++ b/include/net/gen_stats.h
10804 @@ -5,6 +5,7 @@
10805  #include <linux/socket.h>
10806  #include <linux/rtnetlink.h>
10807  #include <linux/pkt_sched.h>
10808 +#include <net/net_seq_lock.h>
10810  struct gnet_stats_basic_cpu {
10811         struct gnet_stats_basic_packed bstats;
10812 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10813                                  spinlock_t *lock, struct gnet_dump *d,
10814                                  int padattr);
10816 -int gnet_stats_copy_basic(const seqcount_t *running,
10817 +int gnet_stats_copy_basic(net_seqlock_t *running,
10818                           struct gnet_dump *d,
10819                           struct gnet_stats_basic_cpu __percpu *cpu,
10820                           struct gnet_stats_basic_packed *b);
10821 -void __gnet_stats_copy_basic(const seqcount_t *running,
10822 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10823                              struct gnet_stats_basic_packed *bstats,
10824                              struct gnet_stats_basic_cpu __percpu *cpu,
10825                              struct gnet_stats_basic_packed *b);
10826 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10827                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10828                       struct gnet_stats_rate_est64 *rate_est,
10829                       spinlock_t *stats_lock,
10830 -                     seqcount_t *running, struct nlattr *opt);
10831 +                     net_seqlock_t *running, struct nlattr *opt);
10832  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10833                         struct gnet_stats_rate_est64 *rate_est);
10834  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10835                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10836                           struct gnet_stats_rate_est64 *rate_est,
10837                           spinlock_t *stats_lock,
10838 -                         seqcount_t *running, struct nlattr *opt);
10839 +                         net_seqlock_t *running, struct nlattr *opt);
10840  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10841                           const struct gnet_stats_rate_est64 *rate_est);
10842  #endif
10843 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10844 index 8b683841e574..bf656008f6e7 100644
10845 --- a/include/net/neighbour.h
10846 +++ b/include/net/neighbour.h
10847 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10849  #endif
10851 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10852 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10854         unsigned int seq;
10855         int hh_len;
10856 @@ -501,7 +501,7 @@ struct neighbour_cb {
10858  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10860 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10861 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10862                                      const struct net_device *dev)
10864         unsigned int seq;
10865 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10866 new file mode 100644
10867 index 000000000000..a7034298a82a
10868 --- /dev/null
10869 +++ b/include/net/net_seq_lock.h
10870 @@ -0,0 +1,15 @@
10871 +#ifndef __NET_NET_SEQ_LOCK_H__
10872 +#define __NET_NET_SEQ_LOCK_H__
10874 +#ifdef CONFIG_PREEMPT_RT_BASE
10875 +# define net_seqlock_t                 seqlock_t
10876 +# define net_seq_begin(__r)            read_seqbegin(__r)
10877 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10879 +#else
10880 +# define net_seqlock_t                 seqcount_t
10881 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10882 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10883 +#endif
10885 +#endif
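net_seq_lock.h lets the networking code keep a single source for both configurations: the qdisc "running" field becomes a full seqlock_t on RT (readers block on the lock and can boost the writer) and stays a bare seqcount_t otherwise. A small hedged sketch of a reader using the wrappers, loosely mirroring how the gen_stats code consumes net_seqlock_t (the function and parameter names are invented):

    #include <net/net_seq_lock.h>

    /* Read two counters consistently against a concurrently running writer. */
    static void read_counters(net_seqlock_t *running,
                              const u64 *a, const u64 *b,
                              u64 *out_a, u64 *out_b)
    {
        unsigned int seq;

        do {
            seq = net_seq_begin(running);
            *out_a = *a;
            *out_b = *b;
        } while (net_seq_retry(running, seq));
    }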
10886 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10887 index 7adf4386ac8f..d3fd5c357268 100644
10888 --- a/include/net/netns/ipv4.h
10889 +++ b/include/net/netns/ipv4.h
10890 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10892         int sysctl_icmp_echo_ignore_all;
10893         int sysctl_icmp_echo_ignore_broadcasts;
10894 +       int sysctl_icmp_echo_sysrq;
10895         int sysctl_icmp_ignore_bogus_error_responses;
10896         int sysctl_icmp_ratelimit;
10897         int sysctl_icmp_ratemask;
10898 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10899 index f18fc1a0321f..5d2c9b89c168 100644
10900 --- a/include/net/sch_generic.h
10901 +++ b/include/net/sch_generic.h
10902 @@ -10,6 +10,7 @@
10903  #include <linux/dynamic_queue_limits.h>
10904  #include <net/gen_stats.h>
10905  #include <net/rtnetlink.h>
10906 +#include <net/net_seq_lock.h>
10908  struct Qdisc_ops;
10909  struct qdisc_walker;
10910 @@ -86,7 +87,7 @@ struct Qdisc {
10911         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10912         struct qdisc_skb_head   q;
10913         struct gnet_stats_basic_packed bstats;
10914 -       seqcount_t              running;
10915 +       net_seqlock_t           running;
10916         struct gnet_stats_queue qstats;
10917         unsigned long           state;
10918         struct Qdisc            *next_sched;
10919 @@ -98,13 +99,22 @@ struct Qdisc {
10920         spinlock_t              busylock ____cacheline_aligned_in_smp;
10921  };
10923 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10924 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10926 +#ifdef CONFIG_PREEMPT_RT_BASE
10927 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10928 +#else
10929         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10930 +#endif
10933  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10935 +#ifdef CONFIG_PREEMPT_RT_BASE
10936 +       if (try_write_seqlock(&qdisc->running))
10937 +               return true;
10938 +       return false;
10939 +#else
10940         if (qdisc_is_running(qdisc))
10941                 return false;
10942         /* Variant of write_seqcount_begin() telling lockdep a trylock
10943 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10944         raw_write_seqcount_begin(&qdisc->running);
10945         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10946         return true;
10947 +#endif
10950  static inline void qdisc_run_end(struct Qdisc *qdisc)
10952 +#ifdef CONFIG_PREEMPT_RT_BASE
10953 +       write_sequnlock(&qdisc->running);
10954 +#else
10955         write_seqcount_end(&qdisc->running);
10956 +#endif
10959  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10960 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10961         return qdisc_lock(root);
10964 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10965 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10967         struct Qdisc *root = qdisc_root_sleeping(qdisc);
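qdisc_run_begin()/qdisc_run_end() keep their trylock semantics: exactly one CPU wins the right to run a given qdisc, and on RT that is now expressed with try_write_seqlock() on the seqlock instead of an open-coded raw seqcount. A hedged sketch of the caller-side pattern (a simplification of what the core does around __qdisc_run()):

    #include <net/sch_generic.h>

    static void run_qdisc(struct Qdisc *q)
    {
        /* Only one CPU may dequeue from this qdisc at a time. */
        if (!qdisc_run_begin(q))
            return;             /* someone else is already running it */

        /* ... dequeue and transmit packets, e.g. via __qdisc_run(q) ... */

        qdisc_run_end(q);
    }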
10969 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10970 new file mode 100644
10971 index 000000000000..f7710de1b1f3
10972 --- /dev/null
10973 +++ b/include/trace/events/hist.h
10974 @@ -0,0 +1,73 @@
10975 +#undef TRACE_SYSTEM
10976 +#define TRACE_SYSTEM hist
10978 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10979 +#define _TRACE_HIST_H
10981 +#include "latency_hist.h"
10982 +#include <linux/tracepoint.h>
10984 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10985 +#define trace_preemptirqsoff_hist(a, b)
10986 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10987 +#else
10988 +TRACE_EVENT(preemptirqsoff_hist,
10990 +       TP_PROTO(int reason, int starthist),
10992 +       TP_ARGS(reason, starthist),
10994 +       TP_STRUCT__entry(
10995 +               __field(int,    reason)
10996 +               __field(int,    starthist)
10997 +       ),
10999 +       TP_fast_assign(
11000 +               __entry->reason         = reason;
11001 +               __entry->starthist      = starthist;
11002 +       ),
11004 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
11005 +                 __entry->starthist ? "start" : "stop")
11007 +#endif
11009 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
11010 +#define trace_hrtimer_interrupt(a, b, c, d)
11011 +#else
11012 +TRACE_EVENT(hrtimer_interrupt,
11014 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
11015 +               struct task_struct *task),
11017 +       TP_ARGS(cpu, offset, curr, task),
11019 +       TP_STRUCT__entry(
11020 +               __field(int,            cpu)
11021 +               __field(long long,      offset)
11022 +               __array(char,           ccomm,  TASK_COMM_LEN)
11023 +               __field(int,            cprio)
11024 +               __array(char,           tcomm,  TASK_COMM_LEN)
11025 +               __field(int,            tprio)
11026 +       ),
11028 +       TP_fast_assign(
11029 +               __entry->cpu    = cpu;
11030 +               __entry->offset = offset;
11031 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
11032 +               __entry->cprio  = curr->prio;
11033 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
11034 +                       task != NULL ? TASK_COMM_LEN : 7);
11035 +               __entry->tprio  = task != NULL ? task->prio : -1;
11036 +       ),
11038 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
11039 +               __entry->cpu, __entry->offset, __entry->ccomm,
11040 +               __entry->cprio, __entry->tcomm, __entry->tprio)
11042 +#endif
11044 +#endif /* _TRACE_HIST_H */
11046 +/* This part must be outside protection */
11047 +#include <trace/define_trace.h>
11048 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
11049 new file mode 100644
11050 index 000000000000..d3f2fbd560b1
11051 --- /dev/null
11052 +++ b/include/trace/events/latency_hist.h
11053 @@ -0,0 +1,29 @@
11054 +#ifndef _LATENCY_HIST_H
11055 +#define _LATENCY_HIST_H
11057 +enum hist_action {
11058 +       IRQS_ON,
11059 +       PREEMPT_ON,
11060 +       TRACE_STOP,
11061 +       IRQS_OFF,
11062 +       PREEMPT_OFF,
11063 +       TRACE_START,
11066 +static char *actions[] = {
11067 +       "IRQS_ON",
11068 +       "PREEMPT_ON",
11069 +       "TRACE_STOP",
11070 +       "IRQS_OFF",
11071 +       "PREEMPT_OFF",
11072 +       "TRACE_START",
11075 +static inline char *getaction(int action)
11077 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
11078 +               return actions[action];
11079 +       return "unknown";
11082 +#endif /* _LATENCY_HIST_H */
11083 diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
11084 index 9b90c57517a9..516ae88cddf4 100644
11085 --- a/include/trace/events/sched.h
11086 +++ b/include/trace/events/sched.h
11087 @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
11088         TP_fast_assign(
11089                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11090                 __entry->pid            = p->pid;
11091 -               __entry->prio           = p->prio;
11092 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11093                 __entry->success        = 1; /* rudiment, kill when possible */
11094                 __entry->target_cpu     = task_cpu(p);
11095         ),
11096 @@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
11097                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
11098                 __entry->next_pid       = next->pid;
11099                 __entry->next_prio      = next->prio;
11100 +               /* XXX SCHED_DEADLINE */
11101         ),
11103         TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
11104 @@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
11105         TP_fast_assign(
11106                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11107                 __entry->pid            = p->pid;
11108 -               __entry->prio           = p->prio;
11109 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11110                 __entry->orig_cpu       = task_cpu(p);
11111                 __entry->dest_cpu       = dest_cpu;
11112         ),
11113 @@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
11114         TP_fast_assign(
11115                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11116                 __entry->pid            = p->pid;
11117 -               __entry->prio           = p->prio;
11118 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11119         ),
11121         TP_printk("comm=%s pid=%d prio=%d",
11122 @@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
11123         TP_fast_assign(
11124                 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
11125                 __entry->pid            = pid_nr(pid);
11126 -               __entry->prio           = current->prio;
11127 +               __entry->prio           = current->prio; /* XXX SCHED_DEADLINE */
11128         ),
11130         TP_printk("comm=%s pid=%d prio=%d",
11131 @@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
11132   */
11133  TRACE_EVENT(sched_pi_setprio,
11135 -       TP_PROTO(struct task_struct *tsk, int newprio),
11136 +       TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
11138 -       TP_ARGS(tsk, newprio),
11139 +       TP_ARGS(tsk, pi_task),
11141         TP_STRUCT__entry(
11142                 __array( char,  comm,   TASK_COMM_LEN   )
11143 @@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
11144                 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
11145                 __entry->pid            = tsk->pid;
11146                 __entry->oldprio        = tsk->prio;
11147 -               __entry->newprio        = newprio;
11148 +               __entry->newprio        = pi_task ? pi_task->prio : tsk->prio;
11149 +               /* XXX SCHED_DEADLINE bits missing */
11150         ),
11152         TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
11153 diff --git a/init/Kconfig b/init/Kconfig
11154 index 34407f15e6d3..2ce33a32e65d 100644
11155 --- a/init/Kconfig
11156 +++ b/init/Kconfig
11157 @@ -506,7 +506,7 @@ config TINY_RCU
11159  config RCU_EXPERT
11160         bool "Make expert-level adjustments to RCU configuration"
11161 -       default n
11162 +       default y if PREEMPT_RT_FULL
11163         help
11164           This option needs to be enabled if you wish to make
11165           expert-level adjustments to RCU configuration.  By default,
11166 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
11168  config RCU_FAST_NO_HZ
11169         bool "Accelerate last non-dyntick-idle CPU's grace periods"
11170 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
11171 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
11172         default n
11173         help
11174           This option permits CPUs to enter dynticks-idle state even if
11175 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
11176  config RCU_BOOST
11177         bool "Enable RCU priority boosting"
11178         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
11179 -       default n
11180 +       default y if PREEMPT_RT_FULL
11181         help
11182           This option boosts the priority of preempted RCU readers that
11183           block the current preemptible RCU grace period for too long.
11184 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
11186  endchoice
11188 -config RCU_EXPEDITE_BOOT
11189 -       bool
11190 -       default n
11191 -       help
11192 -         This option enables expedited grace periods at boot time,
11193 -         as if rcu_expedite_gp() had been invoked early in boot.
11194 -         The corresponding rcu_unexpedite_gp() is invoked from
11195 -         rcu_end_inkernel_boot(), which is intended to be invoked
11196 -         at the end of the kernel-only boot sequence, just before
11197 -         init is exec'ed.
11199 -         Accept the default if unsure.
11201  endmenu # "RCU Subsystem"
11203  config BUILD_BIN2C
11204 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
11205  config RT_GROUP_SCHED
11206         bool "Group scheduling for SCHED_RR/FIFO"
11207         depends on CGROUP_SCHED
11208 +       depends on !PREEMPT_RT_FULL
11209         default n
11210         help
11211           This feature lets you explicitly allocate real CPU bandwidth
11212 @@ -1772,6 +1760,7 @@ choice
11214  config SLAB
11215         bool "SLAB"
11216 +       depends on !PREEMPT_RT_FULL
11217         select HAVE_HARDENED_USERCOPY_ALLOCATOR
11218         help
11219           The regular slab allocator that is established and known to work
11220 @@ -1792,6 +1781,7 @@ config SLUB
11221  config SLOB
11222         depends on EXPERT
11223         bool "SLOB (Simple Allocator)"
11224 +       depends on !PREEMPT_RT_FULL
11225         help
11226            SLOB replaces the stock allocator with a drastically simpler
11227            allocator. SLOB is generally more space efficient but
11228 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
11230  config SLUB_CPU_PARTIAL
11231         default y
11232 -       depends on SLUB && SMP
11233 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
11234         bool "SLUB per cpu partial cache"
11235         help
11236           Per cpu partial caches accelerate objects allocation and freeing
11237 diff --git a/init/Makefile b/init/Makefile
11238 index c4fb45525d08..821190dfaa75 100644
11239 --- a/init/Makefile
11240 +++ b/init/Makefile
11241 @@ -35,4 +35,4 @@ silent_chk_compile.h = :
11242  include/generated/compile.h: FORCE
11243         @$($(quiet)chk_compile.h)
11244         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
11245 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
11246 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
11247 diff --git a/init/main.c b/init/main.c
11248 index 99f026565608..48ffaaad8ac9 100644
11249 --- a/init/main.c
11250 +++ b/init/main.c
11251 @@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void)
11252         setup_command_line(command_line);
11253         setup_nr_cpu_ids();
11254         setup_per_cpu_areas();
11255 +       softirq_early_init();
11256         boot_cpu_state_init();
11257         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
11259 diff --git a/ipc/sem.c b/ipc/sem.c
11260 index 10b94bc59d4a..b8360eaacc7a 100644
11261 --- a/ipc/sem.c
11262 +++ b/ipc/sem.c
11263 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
11264  static void wake_up_sem_queue_prepare(struct list_head *pt,
11265                                 struct sem_queue *q, int error)
11267 +#ifdef CONFIG_PREEMPT_RT_BASE
11268 +       struct task_struct *p = q->sleeper;
11269 +       get_task_struct(p);
11270 +       q->status = error;
11271 +       wake_up_process(p);
11272 +       put_task_struct(p);
11273 +#else
11274         if (list_empty(pt)) {
11275                 /*
11276                  * Hold preempt off so that we don't get preempted and have the
11277 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11278         q->pid = error;
11280         list_add_tail(&q->list, pt);
11281 +#endif
11284  /**
11285 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11286   */
11287  static void wake_up_sem_queue_do(struct list_head *pt)
11289 +#ifndef CONFIG_PREEMPT_RT_BASE
11290         struct sem_queue *q, *t;
11291         int did_something;
11293 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
11294         }
11295         if (did_something)
11296                 preempt_enable();
11297 +#endif
11300  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
11301 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
11302 index ebdb0043203a..b9e6aa7e5aa6 100644
11303 --- a/kernel/Kconfig.locks
11304 +++ b/kernel/Kconfig.locks
11305 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
11307  config MUTEX_SPIN_ON_OWNER
11308         def_bool y
11309 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
11310 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11312  config RWSEM_SPIN_ON_OWNER
11313         def_bool y
11314 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
11315 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11317  config LOCK_SPIN_ON_OWNER
11318         def_bool y
11319 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
11320 index 3f9c97419f02..11dbe26a8279 100644
11321 --- a/kernel/Kconfig.preempt
11322 +++ b/kernel/Kconfig.preempt
11323 @@ -1,3 +1,16 @@
11324 +config PREEMPT
11325 +       bool
11326 +       select PREEMPT_COUNT
11328 +config PREEMPT_RT_BASE
11329 +       bool
11330 +       select PREEMPT
11332 +config HAVE_PREEMPT_LAZY
11333 +       bool
11335 +config PREEMPT_LAZY
11336 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
11338  choice
11339         prompt "Preemption Model"
11340 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
11342           Select this if you are building a kernel for a desktop system.
11344 -config PREEMPT
11345 +config PREEMPT__LL
11346         bool "Preemptible Kernel (Low-Latency Desktop)"
11347 -       select PREEMPT_COUNT
11348 +       select PREEMPT
11349         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
11350         help
11351           This option reduces the latency of the kernel by making
11352 @@ -52,6 +65,22 @@ config PREEMPT
11353           embedded system with latency requirements in the milliseconds
11354           range.
11356 +config PREEMPT_RTB
11357 +       bool "Preemptible Kernel (Basic RT)"
11358 +       select PREEMPT_RT_BASE
11359 +       help
11360 +         This option is basically the same as (Low-Latency Desktop) but
11361 +         enables changes which are preliminary for the full preemptible
11362 +         RT kernel.
11364 +config PREEMPT_RT_FULL
11365 +       bool "Fully Preemptible Kernel (RT)"
11366 +       depends on IRQ_FORCED_THREADING
11367 +       select PREEMPT_RT_BASE
11368 +       select PREEMPT_RCU
11369 +       help
11370 +         Select this to enable the full set of PREEMPT_RT features and make the kernel fully preemptible.
11372  endchoice
11374  config PREEMPT_COUNT
11375 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
11376 index 4c233437ee1a..6c3c9f298f22 100644
11377 --- a/kernel/cgroup.c
11378 +++ b/kernel/cgroup.c
11379 @@ -5041,10 +5041,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
11380         queue_work(cgroup_destroy_wq, &css->destroy_work);
11383 -static void css_release_work_fn(struct work_struct *work)
11384 +static void css_release_work_fn(struct swork_event *sev)
11386         struct cgroup_subsys_state *css =
11387 -               container_of(work, struct cgroup_subsys_state, destroy_work);
11388 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
11389         struct cgroup_subsys *ss = css->ss;
11390         struct cgroup *cgrp = css->cgroup;
11392 @@ -5087,8 +5087,8 @@ static void css_release(struct percpu_ref *ref)
11393         struct cgroup_subsys_state *css =
11394                 container_of(ref, struct cgroup_subsys_state, refcnt);
11396 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
11397 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
11398 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
11399 +       swork_queue(&css->destroy_swork);
11402  static void init_and_link_css(struct cgroup_subsys_state *css,
11403 @@ -5749,6 +5749,7 @@ static int __init cgroup_wq_init(void)
11404          */
11405         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
11406         BUG_ON(!cgroup_destroy_wq);
11407 +       BUG_ON(swork_get());
11409         /*
11410          * Used to destroy pidlists and separate to serve as flush domain.
11411 diff --git a/kernel/cpu.c b/kernel/cpu.c
11412 index 802eb3361a0a..c6a4cf8ba645 100644
11413 --- a/kernel/cpu.c
11414 +++ b/kernel/cpu.c
11415 @@ -239,6 +239,289 @@ static struct {
11416  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
11417  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
11419 +/**
11420 + * hotplug_pcp - per cpu hotplug descriptor
11421 + * @unplug:    set when pin_current_cpu() needs to sync tasks
11422 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
11423 + * @refcount:  counter of tasks in pinned sections
11424 + * @grab_lock: set when the tasks entering pinned sections should wait
11425 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
11426 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
11427 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
11428 + *
11429 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
11430 + * is used as a flag and still exists after @sync_tsk has exited and
11431 + * @sync_tsk set to NULL.
11432 + */
11433 +struct hotplug_pcp {
11434 +       struct task_struct *unplug;
11435 +       struct task_struct *sync_tsk;
11436 +       int refcount;
11437 +       int grab_lock;
11438 +       struct completion synced;
11439 +       struct completion unplug_wait;
11440 +#ifdef CONFIG_PREEMPT_RT_FULL
11441 +       /*
11442 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
11443 +        * the task, otherwise the mutex will cause the task to fail
11444 +        * to sleep when required. (Because it's called from migrate_disable())
11445 +        *
11446 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
11447 +        * state.
11448 +        */
11449 +       spinlock_t lock;
11450 +#else
11451 +       struct mutex mutex;
11452 +#endif
11453 +       int mutex_init;
11456 +#ifdef CONFIG_PREEMPT_RT_FULL
11457 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
11458 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
11459 +#else
11460 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
11461 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
11462 +#endif
11464 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
11466 +/**
11467 + * pin_current_cpu - Prevent the current cpu from being unplugged
11468 + *
11469 + * Lightweight version of get_online_cpus() to prevent cpu from being
11470 + * unplugged when code runs in a migration disabled region.
11471 + *
11472 + * Must be called with preemption disabled (preempt_count = 1)!
11473 + */
11474 +void pin_current_cpu(void)
11476 +       struct hotplug_pcp *hp;
11477 +       int force = 0;
11479 +retry:
11480 +       hp = this_cpu_ptr(&hotplug_pcp);
11482 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
11483 +           hp->unplug == current) {
11484 +               hp->refcount++;
11485 +               return;
11486 +       }
11487 +       if (hp->grab_lock) {
11488 +               preempt_enable();
11489 +               hotplug_lock(hp);
11490 +               hotplug_unlock(hp);
11491 +       } else {
11492 +               preempt_enable();
11493 +               /*
11494 +                * Try to push this task off of this CPU.
11495 +                */
11496 +               if (!migrate_me()) {
11497 +                       preempt_disable();
11498 +                       hp = this_cpu_ptr(&hotplug_pcp);
11499 +                       if (!hp->grab_lock) {
11500 +                               /*
11501 +                                * Just let it continue; it's already pinned
11502 +                                * or about to sleep.
11503 +                                */
11504 +                               force = 1;
11505 +                               goto retry;
11506 +                       }
11507 +                       preempt_enable();
11508 +               }
11509 +       }
11510 +       preempt_disable();
11511 +       goto retry;
11514 +/**
11515 + * unpin_current_cpu - Allow unplug of current cpu
11516 + *
11517 + * Must be called with preemption or interrupts disabled!
11518 + */
11519 +void unpin_current_cpu(void)
11521 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
11523 +       WARN_ON(hp->refcount <= 0);
11525 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
11526 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
11527 +               wake_up_process(hp->unplug);
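
Illustration (not part of the patch): a minimal caller sketch of how a migration-disabled region would pair pin_current_cpu() with unpin_current_cpu(), following the kerneldoc requirements above. The function name example_pinned_section() is hypothetical; per the comments above, the real pairing is done from migrate_disable()/migrate_enable(), which additionally keep the task on this CPU.

/* Hypothetical caller sketch; not part of the patch. */
static void example_pinned_section(void)
{
        preempt_disable();
        pin_current_cpu();      /* this CPU may no longer be unplugged */
        preempt_enable();

        /* ... work that relies on staying on this CPU ... */

        preempt_disable();
        unpin_current_cpu();    /* may wake the sync_unplug thread */
        preempt_enable();
}
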
11530 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
11532 +       set_current_state(TASK_UNINTERRUPTIBLE);
11533 +       while (hp->refcount) {
11534 +               schedule_preempt_disabled();
11535 +               set_current_state(TASK_UNINTERRUPTIBLE);
11536 +       }
11539 +static int sync_unplug_thread(void *data)
11541 +       struct hotplug_pcp *hp = data;
11543 +       wait_for_completion(&hp->unplug_wait);
11544 +       preempt_disable();
11545 +       hp->unplug = current;
11546 +       wait_for_pinned_cpus(hp);
11548 +       /*
11549 +        * This thread will synchronize the cpu_down() with threads
11550 +        * that have pinned the CPU. When the pinned CPU count reaches
11551 +        * zero, we inform the cpu_down code to continue to the next step.
11552 +        */
11553 +       set_current_state(TASK_UNINTERRUPTIBLE);
11554 +       preempt_enable();
11555 +       complete(&hp->synced);
11557 +       /*
11558 +        * If all succeeds, the next step will need tasks to wait till
11559 +        * the CPU is offline before continuing. To do this, the grab_lock
11560 +        * is set and tasks going into pin_current_cpu() will block on the
11561 +        * mutex. But we still need to wait for those that are already in
11562 +        * pinned CPU sections. If cpu_down() fails, kthread_should_stop()
11563 +        * will kick this thread out.
11564 +        */
11565 +       while (!hp->grab_lock && !kthread_should_stop()) {
11566 +               schedule();
11567 +               set_current_state(TASK_UNINTERRUPTIBLE);
11568 +       }
11570 +       /* Make sure grab_lock is seen before we see a stale completion */
11571 +       smp_mb();
11573 +       /*
11574 +        * Now just before cpu_down() enters stop machine, we need to make
11575 +        * sure all tasks that are in pinned CPU sections are out, and new
11576 +        * tasks will now grab the lock, keeping them from entering pinned
11577 +        * CPU sections.
11578 +        */
11579 +       if (!kthread_should_stop()) {
11580 +               preempt_disable();
11581 +               wait_for_pinned_cpus(hp);
11582 +               preempt_enable();
11583 +               complete(&hp->synced);
11584 +       }
11586 +       set_current_state(TASK_UNINTERRUPTIBLE);
11587 +       while (!kthread_should_stop()) {
11588 +               schedule();
11589 +               set_current_state(TASK_UNINTERRUPTIBLE);
11590 +       }
11591 +       set_current_state(TASK_RUNNING);
11593 +       /*
11594 +        * Force this thread off this CPU as it's going down and
11595 +        * we don't want any more work on this CPU.
11596 +        */
11597 +       current->flags &= ~PF_NO_SETAFFINITY;
11598 +       set_cpus_allowed_ptr(current, cpu_present_mask);
11599 +       migrate_me();
11600 +       return 0;
11603 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
11605 +       wake_up_process(hp->sync_tsk);
11606 +       wait_for_completion(&hp->synced);
11609 +static void __cpu_unplug_wait(unsigned int cpu)
11611 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11613 +       complete(&hp->unplug_wait);
11614 +       wait_for_completion(&hp->synced);
11618 + * Start the sync_unplug_thread on the target cpu and wait for it to
11619 + * complete.
11620 + */
11621 +static int cpu_unplug_begin(unsigned int cpu)
11623 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11624 +       int err;
11626 +       /* Protected by cpu_hotplug.lock */
11627 +       if (!hp->mutex_init) {
11628 +#ifdef CONFIG_PREEMPT_RT_FULL
11629 +               spin_lock_init(&hp->lock);
11630 +#else
11631 +               mutex_init(&hp->mutex);
11632 +#endif
11633 +               hp->mutex_init = 1;
11634 +       }
11636 +       /* Inform the scheduler to migrate tasks off this CPU */
11637 +       tell_sched_cpu_down_begin(cpu);
11639 +       init_completion(&hp->synced);
11640 +       init_completion(&hp->unplug_wait);
11642 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11643 +       if (IS_ERR(hp->sync_tsk)) {
11644 +               err = PTR_ERR(hp->sync_tsk);
11645 +               hp->sync_tsk = NULL;
11646 +               return err;
11647 +       }
11648 +       kthread_bind(hp->sync_tsk, cpu);
11650 +       /*
11651 +        * Wait for tasks to get out of the pinned sections;
11652 +        * it's still OK if new tasks enter. Some CPU notifiers will
11653 +        * wait for tasks that are going to enter these sections and
11654 +        * we must not have them block.
11655 +        */
11656 +       wake_up_process(hp->sync_tsk);
11657 +       return 0;
11660 +static void cpu_unplug_sync(unsigned int cpu)
11662 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11664 +       init_completion(&hp->synced);
11665 +       /* The completion needs to be initialized before setting grab_lock */
11666 +       smp_wmb();
11668 +       /* Grab the mutex before setting grab_lock */
11669 +       hotplug_lock(hp);
11670 +       hp->grab_lock = 1;
11672 +       /*
11673 +        * The CPU notifiers have been completed.
11674 +        * Wait for tasks to get out of pinned CPU sections and have new
11675 +        * tasks block until the CPU is completely down.
11676 +        */
11677 +       __cpu_unplug_sync(hp);
11679 +       /* All done with the sync thread */
11680 +       kthread_stop(hp->sync_tsk);
11681 +       hp->sync_tsk = NULL;
11684 +static void cpu_unplug_done(unsigned int cpu)
11686 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11688 +       hp->unplug = NULL;
11689 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11690 +       smp_wmb();
11692 +       if (hp->sync_tsk)
11693 +               kthread_stop(hp->sync_tsk);
11695 +       if (hp->grab_lock) {
11696 +               hotplug_unlock(hp);
11697 +               /* protected by cpu_hotplug.lock */
11698 +               hp->grab_lock = 0;
11699 +       }
11700 +       tell_sched_cpu_down_done(cpu);
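
Summary sketch (not part of the patch): the hunks that follow wire these helpers into the hotplug path; restated here as a call-order comment for orientation.

/*
 * cpu_down() side ordering, as added by the hunks below (paraphrased):
 *
 *   _cpu_down()
 *     cpu_hotplug_begin();
 *     cpu_unplug_begin(cpu);      // create sync_unplug/<cpu>, bound to cpu
 *     ...
 *     takedown_cpu()              // reached via the cpuhp state machine
 *       __cpu_unplug_wait(cpu);   // sync thread drains current pinners
 *       smpboot_park_threads(cpu);
 *       cpu_unplug_sync(cpu);     // grab_lock = 1, block new pinners
 *       ... stop machine, CPU goes offline ...
 *     cpu_unplug_done(cpu);       // clear state, stop the sync thread
 *     cpu_hotplug_done();
 */
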
11703  void get_online_cpus(void)
11705 @@ -802,10 +1085,14 @@ static int takedown_cpu(unsigned int cpu)
11706         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11707         int err;
11709 +       __cpu_unplug_wait(cpu);
11710         /* Park the smpboot threads */
11711         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11712         smpboot_park_threads(cpu);
11714 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11715 +       cpu_unplug_sync(cpu);
11717         /*
11718          * Prevent irq alloc/free while the dying cpu reorganizes the
11719          * interrupt affinities.
11720 @@ -890,6 +1177,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11721         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11722         int prev_state, ret = 0;
11723         bool hasdied = false;
11724 +       int mycpu;
11725 +       cpumask_var_t cpumask;
11726 +       cpumask_var_t cpumask_org;
11728         if (num_online_cpus() == 1)
11729                 return -EBUSY;
11730 @@ -897,7 +1187,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11731         if (!cpu_present(cpu))
11732                 return -EINVAL;
11734 +       /* Move the downtaker off the unplug cpu */
11735 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11736 +               return -ENOMEM;
11737 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11738 +               free_cpumask_var(cpumask);
11739 +               return -ENOMEM;
11740 +       }
11742 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11743 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11744 +       set_cpus_allowed_ptr(current, cpumask);
11745 +       free_cpumask_var(cpumask);
11746 +       migrate_disable();
11747 +       mycpu = smp_processor_id();
11748 +       if (mycpu == cpu) {
11749 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11750 +               migrate_enable();
11751 +               ret = -EBUSY;
11752 +               goto restore_cpus;
11753 +       }
11755 +       migrate_enable();
11756         cpu_hotplug_begin();
11757 +       ret = cpu_unplug_begin(cpu);
11758 +       if (ret) {
11759 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11760 +               goto out_cancel;
11761 +       }
11763         cpuhp_tasks_frozen = tasks_frozen;
11765 @@ -936,10 +1253,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11767         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11768  out:
11769 +       cpu_unplug_done(cpu);
11770 +out_cancel:
11771         cpu_hotplug_done();
11772         /* This post dead nonsense must die */
11773         if (!ret && hasdied)
11774                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11775 +restore_cpus:
11776 +       set_cpus_allowed_ptr(current, cpumask_org);
11777 +       free_cpumask_var(cpumask_org);
11778         return ret;
11781 @@ -1242,6 +1564,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
11783  #endif /* CONFIG_PM_SLEEP_SMP */
11785 +int __boot_cpu_id;
11787  #endif /* CONFIG_SMP */
11789  /* Boot processor state steps */
11790 @@ -1926,6 +2250,10 @@ void __init boot_cpu_init(void)
11791         set_cpu_active(cpu, true);
11792         set_cpu_present(cpu, true);
11793         set_cpu_possible(cpu, true);
11795 +#ifdef CONFIG_SMP
11796 +       __boot_cpu_id = cpu;
11797 +#endif
11800  /*
11801 diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
11802 index 009cc9a17d95..67b02e138a47 100644
11803 --- a/kernel/cpu_pm.c
11804 +++ b/kernel/cpu_pm.c
11805 @@ -22,15 +22,21 @@
11806  #include <linux/spinlock.h>
11807  #include <linux/syscore_ops.h>
11809 -static DEFINE_RWLOCK(cpu_pm_notifier_lock);
11810 -static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
11811 +static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
11813  static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
11815         int ret;
11817 -       ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
11818 +       /*
11819 +        * __atomic_notifier_call_chain() has an RCU read-side critical section,
11820 +        * which could be dysfunctional in cpu idle. Copy the RCU_NONIDLE() code
11821 +        * to let RCU know this.
11822 +        */
11823 +       rcu_irq_enter_irqson();
11824 +       ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
11825                 nr_to_call, nr_calls);
11826 +       rcu_irq_exit_irqson();
11828         return notifier_to_errno(ret);
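
Illustration (not part of the patch): as the comment above says, the open-coded rcu_irq_enter_irqson()/rcu_irq_exit_irqson() pair is the body of the stock RCU_NONIDLE() helper; this is a sketch of the same call written with that macro (the wrapper name is hypothetical).

/* Sketch only: equivalent formulation using RCU_NONIDLE(). */
static int example_cpu_pm_notify(enum cpu_pm_event event, int nr_to_call,
                                 int *nr_calls)
{
        int ret;

        RCU_NONIDLE(ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain,
                                                       event, NULL,
                                                       nr_to_call, nr_calls));
        return notifier_to_errno(ret);
}
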
11830 @@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
11831   */
11832  int cpu_pm_register_notifier(struct notifier_block *nb)
11834 -       unsigned long flags;
11835 -       int ret;
11837 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
11838 -       ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
11839 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
11841 -       return ret;
11842 +       return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
11844  EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
11846 @@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
11847   */
11848  int cpu_pm_unregister_notifier(struct notifier_block *nb)
11850 -       unsigned long flags;
11851 -       int ret;
11853 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
11854 -       ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
11855 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
11857 -       return ret;
11858 +       return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
11860  EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
11862 @@ -100,7 +92,6 @@ int cpu_pm_enter(void)
11863         int nr_calls;
11864         int ret = 0;
11866 -       read_lock(&cpu_pm_notifier_lock);
11867         ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
11868         if (ret)
11869                 /*
11870 @@ -108,7 +99,6 @@ int cpu_pm_enter(void)
11871                  * PM entry who are notified earlier to prepare for it.
11872                  */
11873                 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
11874 -       read_unlock(&cpu_pm_notifier_lock);
11876         return ret;
11878 @@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
11879   */
11880  int cpu_pm_exit(void)
11882 -       int ret;
11884 -       read_lock(&cpu_pm_notifier_lock);
11885 -       ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
11886 -       read_unlock(&cpu_pm_notifier_lock);
11888 -       return ret;
11889 +       return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
11891  EXPORT_SYMBOL_GPL(cpu_pm_exit);
11893 @@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
11894         int nr_calls;
11895         int ret = 0;
11897 -       read_lock(&cpu_pm_notifier_lock);
11898         ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
11899         if (ret)
11900                 /*
11901 @@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
11902                  * PM entry who are notified earlier to prepare for it.
11903                  */
11904                 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
11905 -       read_unlock(&cpu_pm_notifier_lock);
11907         return ret;
11909 @@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
11910   */
11911  int cpu_cluster_pm_exit(void)
11913 -       int ret;
11915 -       read_lock(&cpu_pm_notifier_lock);
11916 -       ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
11917 -       read_unlock(&cpu_pm_notifier_lock);
11919 -       return ret;
11920 +       return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
11922  EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
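
Illustration (not part of the patch): the conversion above to an atomic notifier chain leaves the registration interface unchanged; a minimal sketch of a client CPU PM notifier (the callback, notifier_block and init function names are made up).

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static int example_cpu_pm_callback(struct notifier_block *nb,
                                   unsigned long action, void *data)
{
        switch (action) {
        case CPU_PM_ENTER:
                /* save per-CPU hardware context before a low-power state */
                break;
        case CPU_PM_EXIT:
                /* restore the context after leaving the low-power state */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
        .notifier_call = example_cpu_pm_callback,
};

static int __init example_cpu_pm_init(void)
{
        return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
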
11924 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
11925 index 511b1dd8ff09..1dd63833ecdc 100644
11926 --- a/kernel/cpuset.c
11927 +++ b/kernel/cpuset.c
11928 @@ -285,7 +285,7 @@ static struct cpuset top_cpuset = {
11929   */
11931  static DEFINE_MUTEX(cpuset_mutex);
11932 -static DEFINE_SPINLOCK(callback_lock);
11933 +static DEFINE_RAW_SPINLOCK(callback_lock);
11935  static struct workqueue_struct *cpuset_migrate_mm_wq;
11937 @@ -908,9 +908,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
11938                         continue;
11939                 rcu_read_unlock();
11941 -               spin_lock_irq(&callback_lock);
11942 +               raw_spin_lock_irq(&callback_lock);
11943                 cpumask_copy(cp->effective_cpus, new_cpus);
11944 -               spin_unlock_irq(&callback_lock);
11945 +               raw_spin_unlock_irq(&callback_lock);
11947                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11948                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11949 @@ -975,9 +975,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
11950         if (retval < 0)
11951                 return retval;
11953 -       spin_lock_irq(&callback_lock);
11954 +       raw_spin_lock_irq(&callback_lock);
11955         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11956 -       spin_unlock_irq(&callback_lock);
11957 +       raw_spin_unlock_irq(&callback_lock);
11959         /* use trialcs->cpus_allowed as a temp variable */
11960         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11961 @@ -1177,9 +1177,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
11962                         continue;
11963                 rcu_read_unlock();
11965 -               spin_lock_irq(&callback_lock);
11966 +               raw_spin_lock_irq(&callback_lock);
11967                 cp->effective_mems = *new_mems;
11968 -               spin_unlock_irq(&callback_lock);
11969 +               raw_spin_unlock_irq(&callback_lock);
11971                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11972                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11973 @@ -1247,9 +1247,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
11974         if (retval < 0)
11975                 goto done;
11977 -       spin_lock_irq(&callback_lock);
11978 +       raw_spin_lock_irq(&callback_lock);
11979         cs->mems_allowed = trialcs->mems_allowed;
11980 -       spin_unlock_irq(&callback_lock);
11981 +       raw_spin_unlock_irq(&callback_lock);
11983         /* use trialcs->mems_allowed as a temp variable */
11984         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11985 @@ -1340,9 +1340,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
11986         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11987                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11989 -       spin_lock_irq(&callback_lock);
11990 +       raw_spin_lock_irq(&callback_lock);
11991         cs->flags = trialcs->flags;
11992 -       spin_unlock_irq(&callback_lock);
11993 +       raw_spin_unlock_irq(&callback_lock);
11995         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11996                 rebuild_sched_domains_locked();
11997 @@ -1757,7 +1757,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11998         cpuset_filetype_t type = seq_cft(sf)->private;
11999         int ret = 0;
12001 -       spin_lock_irq(&callback_lock);
12002 +       raw_spin_lock_irq(&callback_lock);
12004         switch (type) {
12005         case FILE_CPULIST:
12006 @@ -1776,7 +1776,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
12007                 ret = -EINVAL;
12008         }
12010 -       spin_unlock_irq(&callback_lock);
12011 +       raw_spin_unlock_irq(&callback_lock);
12012         return ret;
12015 @@ -1991,12 +1991,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
12017         cpuset_inc();
12019 -       spin_lock_irq(&callback_lock);
12020 +       raw_spin_lock_irq(&callback_lock);
12021         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
12022                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
12023                 cs->effective_mems = parent->effective_mems;
12024         }
12025 -       spin_unlock_irq(&callback_lock);
12026 +       raw_spin_unlock_irq(&callback_lock);
12028         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
12029                 goto out_unlock;
12030 @@ -2023,12 +2023,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
12031         }
12032         rcu_read_unlock();
12034 -       spin_lock_irq(&callback_lock);
12035 +       raw_spin_lock_irq(&callback_lock);
12036         cs->mems_allowed = parent->mems_allowed;
12037         cs->effective_mems = parent->mems_allowed;
12038         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
12039         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
12040 -       spin_unlock_irq(&callback_lock);
12041 +       raw_spin_unlock_irq(&callback_lock);
12042  out_unlock:
12043         mutex_unlock(&cpuset_mutex);
12044         return 0;
12045 @@ -2067,7 +2067,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
12046  static void cpuset_bind(struct cgroup_subsys_state *root_css)
12048         mutex_lock(&cpuset_mutex);
12049 -       spin_lock_irq(&callback_lock);
12050 +       raw_spin_lock_irq(&callback_lock);
12052         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
12053                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
12054 @@ -2078,7 +2078,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
12055                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
12056         }
12058 -       spin_unlock_irq(&callback_lock);
12059 +       raw_spin_unlock_irq(&callback_lock);
12060         mutex_unlock(&cpuset_mutex);
12063 @@ -2179,12 +2179,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
12065         bool is_empty;
12067 -       spin_lock_irq(&callback_lock);
12068 +       raw_spin_lock_irq(&callback_lock);
12069         cpumask_copy(cs->cpus_allowed, new_cpus);
12070         cpumask_copy(cs->effective_cpus, new_cpus);
12071         cs->mems_allowed = *new_mems;
12072         cs->effective_mems = *new_mems;
12073 -       spin_unlock_irq(&callback_lock);
12074 +       raw_spin_unlock_irq(&callback_lock);
12076         /*
12077          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
12078 @@ -2221,10 +2221,10 @@ hotplug_update_tasks(struct cpuset *cs,
12079         if (nodes_empty(*new_mems))
12080                 *new_mems = parent_cs(cs)->effective_mems;
12082 -       spin_lock_irq(&callback_lock);
12083 +       raw_spin_lock_irq(&callback_lock);
12084         cpumask_copy(cs->effective_cpus, new_cpus);
12085         cs->effective_mems = *new_mems;
12086 -       spin_unlock_irq(&callback_lock);
12087 +       raw_spin_unlock_irq(&callback_lock);
12089         if (cpus_updated)
12090                 update_tasks_cpumask(cs);
12091 @@ -2317,21 +2317,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
12093         /* synchronize cpus_allowed to cpu_active_mask */
12094         if (cpus_updated) {
12095 -               spin_lock_irq(&callback_lock);
12096 +               raw_spin_lock_irq(&callback_lock);
12097                 if (!on_dfl)
12098                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
12099                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
12100 -               spin_unlock_irq(&callback_lock);
12101 +               raw_spin_unlock_irq(&callback_lock);
12102                 /* we don't mess with cpumasks of tasks in top_cpuset */
12103         }
12105         /* synchronize mems_allowed to N_MEMORY */
12106         if (mems_updated) {
12107 -               spin_lock_irq(&callback_lock);
12108 +               raw_spin_lock_irq(&callback_lock);
12109                 if (!on_dfl)
12110                         top_cpuset.mems_allowed = new_mems;
12111                 top_cpuset.effective_mems = new_mems;
12112 -               spin_unlock_irq(&callback_lock);
12113 +               raw_spin_unlock_irq(&callback_lock);
12114                 update_tasks_nodemask(&top_cpuset);
12115         }
12117 @@ -2436,11 +2436,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
12119         unsigned long flags;
12121 -       spin_lock_irqsave(&callback_lock, flags);
12122 +       raw_spin_lock_irqsave(&callback_lock, flags);
12123         rcu_read_lock();
12124         guarantee_online_cpus(task_cs(tsk), pmask);
12125         rcu_read_unlock();
12126 -       spin_unlock_irqrestore(&callback_lock, flags);
12127 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12130  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
12131 @@ -2488,11 +2488,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
12132         nodemask_t mask;
12133         unsigned long flags;
12135 -       spin_lock_irqsave(&callback_lock, flags);
12136 +       raw_spin_lock_irqsave(&callback_lock, flags);
12137         rcu_read_lock();
12138         guarantee_online_mems(task_cs(tsk), &mask);
12139         rcu_read_unlock();
12140 -       spin_unlock_irqrestore(&callback_lock, flags);
12141 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12143         return mask;
12145 @@ -2584,14 +2584,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
12146                 return true;
12148         /* Not hardwall and node outside mems_allowed: scan up cpusets */
12149 -       spin_lock_irqsave(&callback_lock, flags);
12150 +       raw_spin_lock_irqsave(&callback_lock, flags);
12152         rcu_read_lock();
12153         cs = nearest_hardwall_ancestor(task_cs(current));
12154         allowed = node_isset(node, cs->mems_allowed);
12155         rcu_read_unlock();
12157 -       spin_unlock_irqrestore(&callback_lock, flags);
12158 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12159         return allowed;
12162 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
12163 index 77777d918676..3203e9dee9f8 100644
12164 --- a/kernel/debug/kdb/kdb_io.c
12165 +++ b/kernel/debug/kdb/kdb_io.c
12166 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12167         int linecount;
12168         int colcount;
12169         int logging, saved_loglevel = 0;
12170 -       int saved_trap_printk;
12171         int got_printf_lock = 0;
12172         int retlen = 0;
12173         int fnd, len;
12174 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12175         unsigned long uninitialized_var(flags);
12177         preempt_disable();
12178 -       saved_trap_printk = kdb_trap_printk;
12179 -       kdb_trap_printk = 0;
12181         /* Serialize kdb_printf if multiple cpus try to write at once.
12182          * But if any cpu goes recursive in kdb, just print the output,
12183 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12184         } else {
12185                 __release(kdb_printf_lock);
12186         }
12187 -       kdb_trap_printk = saved_trap_printk;
12188         preempt_enable();
12189         return retlen;
12191 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
12192         va_list ap;
12193         int r;
12195 +       kdb_trap_printk++;
12196         va_start(ap, fmt);
12197         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
12198         va_end(ap);
12199 +       kdb_trap_printk--;
12201         return r;
12203 diff --git a/kernel/events/core.c b/kernel/events/core.c
12204 index 13b9784427b0..f74fbfe5465c 100644
12205 --- a/kernel/events/core.c
12206 +++ b/kernel/events/core.c
12207 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
12208         raw_spin_lock_init(&cpuctx->hrtimer_lock);
12209         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
12210         timer->function = perf_mux_hrtimer_handler;
12211 +       timer->irqsafe = 1;
12214  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
12215 @@ -8405,6 +8406,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
12217         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
12218         hwc->hrtimer.function = perf_swevent_hrtimer;
12219 +       hwc->hrtimer.irqsafe = 1;
12221         /*
12222          * Since hrtimers have a fixed rate, we can do a static freq->period
12223 diff --git a/kernel/exit.c b/kernel/exit.c
12224 index 3076f3089919..fb2ebcf3ca7c 100644
12225 --- a/kernel/exit.c
12226 +++ b/kernel/exit.c
12227 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
12228          * Do this under ->siglock, we can race with another thread
12229          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
12230          */
12231 -       flush_sigqueue(&tsk->pending);
12232 +       flush_task_sigqueue(tsk);
12233         tsk->sighand = NULL;
12234         spin_unlock(&sighand->siglock);
12236 diff --git a/kernel/fork.c b/kernel/fork.c
12237 index 70e10cb49be0..2529725eefa2 100644
12238 --- a/kernel/fork.c
12239 +++ b/kernel/fork.c
12240 @@ -77,6 +77,7 @@
12241  #include <linux/compiler.h>
12242  #include <linux/sysctl.h>
12243  #include <linux/kcov.h>
12244 +#include <linux/kprobes.h>
12246  #include <asm/pgtable.h>
12247  #include <asm/pgalloc.h>
12248 @@ -378,13 +379,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
12249         if (atomic_dec_and_test(&sig->sigcnt))
12250                 free_signal_struct(sig);
12253 +#ifdef CONFIG_PREEMPT_RT_BASE
12254 +static
12255 +#endif
12256  void __put_task_struct(struct task_struct *tsk)
12258         WARN_ON(!tsk->exit_state);
12259         WARN_ON(atomic_read(&tsk->usage));
12260         WARN_ON(tsk == current);
12262 +       /*
12263 +        * Remove function-return probe instances associated with this
12264 +        * task and put them back on the free list.
12265 +        */
12266 +       kprobe_flush_task(tsk);
12268 +       /* Task is done with its stack. */
12269 +       put_task_stack(tsk);
12271         cgroup_free(tsk);
12272         task_numa_free(tsk);
12273         security_task_free(tsk);
12274 @@ -395,7 +407,18 @@ void __put_task_struct(struct task_struct *tsk)
12275         if (!profile_handoff_task(tsk))
12276                 free_task(tsk);
12278 +#ifndef CONFIG_PREEMPT_RT_BASE
12279  EXPORT_SYMBOL_GPL(__put_task_struct);
12280 +#else
12281 +void __put_task_struct_cb(struct rcu_head *rhp)
12283 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
12285 +       __put_task_struct(tsk);
12288 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
12289 +#endif
12291  void __init __weak arch_task_cache_init(void) { }
12293 @@ -541,6 +564,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
12294         tsk->splice_pipe = NULL;
12295         tsk->task_frag.page = NULL;
12296         tsk->wake_q.next = NULL;
12297 +       tsk->wake_q_sleeper.next = NULL;
12299         account_kernel_stack(tsk, 1);
12301 @@ -867,6 +891,19 @@ void __mmdrop(struct mm_struct *mm)
12303  EXPORT_SYMBOL_GPL(__mmdrop);
12305 +#ifdef CONFIG_PREEMPT_RT_BASE
12307 + * RCU callback for delayed mm drop. Not strictly RCU, but we don't
12308 + * want to add another facility just to make this work.
12309 + */
12310 +void __mmdrop_delayed(struct rcu_head *rhp)
12312 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
12314 +       __mmdrop(mm);
12316 +#endif
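
Sketch (not part of the patch): __mmdrop_delayed() above is an RCU callback hanging off mm->delayed_drop (the field the container_of() resolves); a hypothetical caller would defer the final drop roughly like this. The helper name below is an assumption; the patch presumably adds its real counterpart in a header hunk not shown here.

/* Hypothetical sketch of a deferred mm drop under PREEMPT_RT_BASE. */
static inline void example_mmdrop_delayed(struct mm_struct *mm)
{
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
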
12318  static inline void __mmput(struct mm_struct *mm)
12320         VM_BUG_ON(atomic_read(&mm->mm_users));
12321 @@ -1432,6 +1469,7 @@ static void rt_mutex_init_task(struct task_struct *p)
12322  #ifdef CONFIG_RT_MUTEXES
12323         p->pi_waiters = RB_ROOT;
12324         p->pi_waiters_leftmost = NULL;
12325 +       p->pi_top_task = NULL;
12326         p->pi_blocked_on = NULL;
12327  #endif
12329 @@ -1441,6 +1479,9 @@ static void rt_mutex_init_task(struct task_struct *p)
12330   */
12331  static void posix_cpu_timers_init(struct task_struct *tsk)
12333 +#ifdef CONFIG_PREEMPT_RT_BASE
12334 +       tsk->posix_timer_list = NULL;
12335 +#endif
12336         tsk->cputime_expires.prof_exp = 0;
12337         tsk->cputime_expires.virt_exp = 0;
12338         tsk->cputime_expires.sched_exp = 0;
12339 @@ -1567,6 +1608,7 @@ static __latent_entropy struct task_struct *copy_process(
12340         spin_lock_init(&p->alloc_lock);
12342         init_sigpending(&p->pending);
12343 +       p->sigqueue_cache = NULL;
12345         p->utime = p->stime = p->gtime = 0;
12346         p->utimescaled = p->stimescaled = 0;
12347 diff --git a/kernel/futex.c b/kernel/futex.c
12348 index 88bad86180ac..2e074d63e8fa 100644
12349 --- a/kernel/futex.c
12350 +++ b/kernel/futex.c
12351 @@ -801,7 +801,7 @@ static int refill_pi_state_cache(void)
12352         return 0;
12355 -static struct futex_pi_state * alloc_pi_state(void)
12356 +static struct futex_pi_state *alloc_pi_state(void)
12358         struct futex_pi_state *pi_state = current->pi_state_cache;
12360 @@ -811,6 +811,11 @@ static struct futex_pi_state * alloc_pi_state(void)
12361         return pi_state;
12364 +static void get_pi_state(struct futex_pi_state *pi_state)
12366 +       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
12369  /*
12370   * Drops a reference to the pi_state object and frees or caches it
12371   * when the last reference is gone.
12372 @@ -855,7 +860,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
12373   * Look up the task based on what TID userspace gave us.
12374   * We don't trust it.
12375   */
12376 -static struct task_struct * futex_find_get_task(pid_t pid)
12377 +static struct task_struct *futex_find_get_task(pid_t pid)
12379         struct task_struct *p;
12381 @@ -905,7 +910,9 @@ void exit_pi_state_list(struct task_struct *curr)
12382                  * task still owns the PI-state:
12383                  */
12384                 if (head->next != next) {
12385 +                       raw_spin_unlock_irq(&curr->pi_lock);
12386                         spin_unlock(&hb->lock);
12387 +                       raw_spin_lock_irq(&curr->pi_lock);
12388                         continue;
12389                 }
12391 @@ -915,10 +922,12 @@ void exit_pi_state_list(struct task_struct *curr)
12392                 pi_state->owner = NULL;
12393                 raw_spin_unlock_irq(&curr->pi_lock);
12395 -               rt_mutex_unlock(&pi_state->pi_mutex);
12397 +               get_pi_state(pi_state);
12398                 spin_unlock(&hb->lock);
12400 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12401 +               put_pi_state(pi_state);
12403                 raw_spin_lock_irq(&curr->pi_lock);
12404         }
12405         raw_spin_unlock_irq(&curr->pi_lock);
12406 @@ -972,6 +981,39 @@ void exit_pi_state_list(struct task_struct *curr)
12407   *
12408   * [10] There is no transient state which leaves owner and user space
12409   *     TID out of sync.
12410 + *
12411 + *
12412 + * Serialization and lifetime rules:
12413 + *
12414 + * hb->lock:
12415 + *
12416 + *     hb -> futex_q, relation
12417 + *     futex_q -> pi_state, relation
12418 + *
12419 + *     (cannot be raw because hb can contain an arbitrary number
12420 + *      of futex_q's)
12421 + *
12422 + * pi_mutex->wait_lock:
12423 + *
12424 + *     {uval, pi_state}
12425 + *
12426 + *     (and pi_mutex 'obviously')
12427 + *
12428 + * p->pi_lock:
12429 + *
12430 + *     p->pi_state_list -> pi_state->list, relation
12431 + *
12432 + * pi_state->refcount:
12433 + *
12434 + *     pi_state lifetime
12435 + *
12436 + *
12437 + * Lock order:
12438 + *
12439 + *   hb->lock
12440 + *     pi_mutex->wait_lock
12441 + *       p->pi_lock
12442 + *
12443   */
12445  /*
12446 @@ -979,10 +1021,13 @@ void exit_pi_state_list(struct task_struct *curr)
12447   * the pi_state against the user space value. If correct, attach to
12448   * it.
12449   */
12450 -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12451 +static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
12452 +                             struct futex_pi_state *pi_state,
12453                               struct futex_pi_state **ps)
12455         pid_t pid = uval & FUTEX_TID_MASK;
12456 +       u32 uval2;
12457 +       int ret;
12459         /*
12460          * Userspace might have messed up non-PI and PI futexes [3]
12461 @@ -990,8 +1035,38 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12462         if (unlikely(!pi_state))
12463                 return -EINVAL;
12465 +       /*
12466 +        * We get here with hb->lock held, and having found a
12467 +        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
12468 +        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
12469 +        * which in turn means that futex_lock_pi() still has a reference on
12470 +        * our pi_state.
12471 +        *
12472 +        * The waiter holding a reference on @pi_state also protects against
12473 +        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
12474 +        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
12475 +        * free pi_state before we can take a reference ourselves.
12476 +        */
12477         WARN_ON(!atomic_read(&pi_state->refcount));
12479 +       /*
12480 +        * Now that we have a pi_state, we can acquire wait_lock
12481 +        * and do the state validation.
12482 +        */
12483 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12485 +       /*
12486 +        * Since {uval, pi_state} is serialized by wait_lock, and our current
12487 +        * uval was read without holding it, it can have changed. Verify it
12488 +        * still is what we expect it to be, otherwise retry the entire
12489 +        * operation.
12490 +        */
12491 +       if (get_futex_value_locked(&uval2, uaddr))
12492 +               goto out_efault;
12494 +       if (uval != uval2)
12495 +               goto out_eagain;
12497         /*
12498          * Handle the owner died case:
12499          */
12500 @@ -1007,11 +1082,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12501                          * is not 0. Inconsistent state. [5]
12502                          */
12503                         if (pid)
12504 -                               return -EINVAL;
12505 +                               goto out_einval;
12506                         /*
12507                          * Take a ref on the state and return success. [4]
12508                          */
12509 -                       goto out_state;
12510 +                       goto out_attach;
12511                 }
12513                 /*
12514 @@ -1023,14 +1098,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12515                  * Take a ref on the state and return success. [6]
12516                  */
12517                 if (!pid)
12518 -                       goto out_state;
12519 +                       goto out_attach;
12520         } else {
12521                 /*
12522                  * If the owner died bit is not set, then the pi_state
12523                  * must have an owner. [7]
12524                  */
12525                 if (!pi_state->owner)
12526 -                       return -EINVAL;
12527 +                       goto out_einval;
12528         }
12530         /*
12531 @@ -1039,11 +1114,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12532          * user space TID. [9/10]
12533          */
12534         if (pid != task_pid_vnr(pi_state->owner))
12535 -               return -EINVAL;
12536 -out_state:
12537 -       atomic_inc(&pi_state->refcount);
12538 +               goto out_einval;
12540 +out_attach:
12541 +       get_pi_state(pi_state);
12542 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12543         *ps = pi_state;
12544         return 0;
12546 +out_einval:
12547 +       ret = -EINVAL;
12548 +       goto out_error;
12550 +out_eagain:
12551 +       ret = -EAGAIN;
12552 +       goto out_error;
12554 +out_efault:
12555 +       ret = -EFAULT;
12556 +       goto out_error;
12558 +out_error:
12559 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12560 +       return ret;
12563  /*
12564 @@ -1094,6 +1187,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12566         /*
12567          * No existing pi state. First waiter. [2]
12568 +        *
12569 +        * This creates pi_state; we have hb->lock held, which means nothing
12570 +        * can observe this state and wait_lock is irrelevant.
12571          */
12572         pi_state = alloc_pi_state();
12574 @@ -1118,17 +1214,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12575         return 0;
12578 -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
12579 +static int lookup_pi_state(u32 __user *uaddr, u32 uval,
12580 +                          struct futex_hash_bucket *hb,
12581                            union futex_key *key, struct futex_pi_state **ps)
12583 -       struct futex_q *match = futex_top_waiter(hb, key);
12584 +       struct futex_q *top_waiter = futex_top_waiter(hb, key);
12586         /*
12587          * If there is a waiter on that futex, validate it and
12588          * attach to the pi_state when the validation succeeds.
12589          */
12590 -       if (match)
12591 -               return attach_to_pi_state(uval, match->pi_state, ps);
12592 +       if (top_waiter)
12593 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12595         /*
12596          * We are the first waiter - try to look up the owner based on
12597 @@ -1147,7 +1244,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
12598         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
12599                 return -EFAULT;
12601 -       /*If user space value changed, let the caller retry */
12602 +       /* If user space value changed, let the caller retry */
12603         return curval != uval ? -EAGAIN : 0;
12606 @@ -1175,7 +1272,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12607                                 struct task_struct *task, int set_waiters)
12609         u32 uval, newval, vpid = task_pid_vnr(task);
12610 -       struct futex_q *match;
12611 +       struct futex_q *top_waiter;
12612         int ret;
12614         /*
12615 @@ -1201,9 +1298,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12616          * Lookup existing state first. If it exists, try to attach to
12617          * its pi_state.
12618          */
12619 -       match = futex_top_waiter(hb, key);
12620 -       if (match)
12621 -               return attach_to_pi_state(uval, match->pi_state, ps);
12622 +       top_waiter = futex_top_waiter(hb, key);
12623 +       if (top_waiter)
12624 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12626         /*
12627          * No waiter and user TID is 0. We are here because the
12628 @@ -1284,50 +1381,45 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
12629         wake_q_add(wake_q, p);
12630         __unqueue_futex(q);
12631         /*
12632 -        * The waiting task can free the futex_q as soon as
12633 -        * q->lock_ptr = NULL is written, without taking any locks. A
12634 -        * memory barrier is required here to prevent the following
12635 -        * store to lock_ptr from getting ahead of the plist_del.
12636 +        * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
12637 +        * is written, without taking any locks. This is possible in the event
12638 +        * of a spurious wakeup, for example. A memory barrier is required here
12639 +        * to prevent the following store to lock_ptr from getting ahead of the
12640 +        * plist_del in __unqueue_futex().
12641          */
12642 -       smp_wmb();
12643 -       q->lock_ptr = NULL;
12644 +       smp_store_release(&q->lock_ptr, NULL);
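
Note (not part of the patch): the hunk above folds the separate smp_wmb() plus plain store into one release store; both publisher forms order the __unqueue_futex() writes before lock_ptr becomes visible as NULL. A generic sketch of the two forms (the helper names are made up):

/* Old shape: explicit write barrier followed by a plain store. */
static void example_publish_wmb(struct futex_q *q)
{
        smp_wmb();                      /* order prior writes before ... */
        q->lock_ptr = NULL;             /* ... the publishing store */
}

/* New shape: a release store carries the same ordering by itself. */
static void example_publish_release(struct futex_q *q)
{
        smp_store_release(&q->lock_ptr, NULL);
}
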
12647 -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12648 -                        struct futex_hash_bucket *hb)
12650 + * Caller must hold a reference on @pi_state.
12651 + */
12652 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
12654 -       struct task_struct *new_owner;
12655 -       struct futex_pi_state *pi_state = this->pi_state;
12656         u32 uninitialized_var(curval), newval;
12657 +       struct task_struct *new_owner;
12658 +       bool postunlock = false;
12659         WAKE_Q(wake_q);
12660 -       bool deboost;
12661 +       WAKE_Q(wake_sleeper_q);
12662         int ret = 0;
12664 -       if (!pi_state)
12665 -               return -EINVAL;
12667 -       /*
12668 -        * If current does not own the pi_state then the futex is
12669 -        * inconsistent and user space fiddled with the futex value.
12670 -        */
12671 -       if (pi_state->owner != current)
12672 -               return -EINVAL;
12674 -       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12675         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
12676 +       if (WARN_ON_ONCE(!new_owner)) {
12677 +               /*
12678 +                * As per the comment in futex_unlock_pi() this should not happen.
12679 +                *
12680 +                * When this happens, give up our locks and try again, giving
12681 +                * the futex_lock_pi() instance time to complete, either by
12682 +                * waiting on the rtmutex or removing itself from the futex
12683 +                * queue.
12684 +                */
12685 +               ret = -EAGAIN;
12686 +               goto out_unlock;
12687 +       }
12689         /*
12690 -        * It is possible that the next waiter (the one that brought
12691 -        * this owner to the kernel) timed out and is no longer
12692 -        * waiting on the lock.
12693 -        */
12694 -       if (!new_owner)
12695 -               new_owner = this->task;
12697 -       /*
12698 -        * We pass it to the next owner. The WAITERS bit is always
12699 -        * kept enabled while there is PI state around. We cleanup the
12700 -        * owner died bit, because we are the owner.
12701 +        * We pass it to the next owner. The WAITERS bit is always kept
12702 +        * enabled while there is PI state around. We cleanup the owner
12703 +        * died bit, because we are the owner.
12704          */
12705         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
12707 @@ -1336,6 +1428,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12709         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
12710                 ret = -EFAULT;
12712         } else if (curval != uval) {
12713                 /*
12714                  * If an unconditional UNLOCK_PI operation (user space did not
12715 @@ -1348,10 +1441,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12716                 else
12717                         ret = -EINVAL;
12718         }
12719 -       if (ret) {
12720 -               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12721 -               return ret;
12722 -       }
12724 +       if (ret)
12725 +               goto out_unlock;
12727 +       /*
12728 +        * This is a point of no return; once we modify the uval there is no
12729 +        * going back and subsequent operations must not fail.
12730 +        */
12732         raw_spin_lock(&pi_state->owner->pi_lock);
12733         WARN_ON(list_empty(&pi_state->list));
12734 @@ -1364,22 +1461,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12735         pi_state->owner = new_owner;
12736         raw_spin_unlock(&new_owner->pi_lock);
12738 +       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
12739 +                                            &wake_sleeper_q);
12740 +out_unlock:
12741         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12743 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
12744 +       if (postunlock)
12745 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
12747 -       /*
12748 -        * First unlock HB so the waiter does not spin on it once he got woken
12749 -        * up. Second wake up the waiter before the priority is adjusted. If we
12750 -        * deboost first (and lose our higher priority), then the task might get
12751 -        * scheduled away before the wake up can take place.
12752 -        */
12753 -       spin_unlock(&hb->lock);
12754 -       wake_up_q(&wake_q);
12755 -       if (deboost)
12756 -               rt_mutex_adjust_prio(current);
12758 -       return 0;
12759 +       return ret;
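
Illustration (not part of the patch): the deferred-wakeup pattern used above, in generic form: wakeups are collected on an on-stack wake queue while a lock is held and issued only after the lock is dropped. Only WAKE_Q()/wake_q_add()/wake_up_q() are the real helpers; the function and its parameters are made up.

static void example_deferred_wakeup(spinlock_t *lock,
                                    struct task_struct *waiter)
{
        WAKE_Q(wake_q);                 /* on-stack wake queue */

        spin_lock(lock);
        /* ... state changes deciding who must run next ... */
        wake_q_add(&wake_q, waiter);    /* queue only; no wakeup yet */
        spin_unlock(lock);

        wake_up_q(&wake_q);             /* wake after the lock is dropped */
}
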
12762  /*
12763 @@ -1825,7 +1915,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12764                          * If that call succeeds then we have pi_state and an
12765                          * initial refcount on it.
12766                          */
12767 -                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
12768 +                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
12769                 }
12771                 switch (ret) {
12772 @@ -1908,7 +1998,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12773                          * refcount on the pi_state and store the pointer in
12774                          * the futex_q object of the waiter.
12775                          */
12776 -                       atomic_inc(&pi_state->refcount);
12777 +                       get_pi_state(pi_state);
12778                         this->pi_state = pi_state;
12779                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
12780                                                         this->rt_waiter,
12781 @@ -1925,6 +2015,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12782                                 requeue_pi_wake_futex(this, &key2, hb2);
12783                                 drop_count++;
12784                                 continue;
12785 +                       } else if (ret == -EAGAIN) {
12786 +                               /*
12787 +                                * Waiter was woken by timeout or
12788 +                                * signal and has set pi_blocked_on to
12789 +                                * PI_WAKEUP_INPROGRESS before we
12790 +                                * tried to enqueue it on the rtmutex.
12791 +                                */
12792 +                               this->pi_state = NULL;
12793 +                               put_pi_state(pi_state);
12794 +                               continue;
12795                         } else if (ret) {
12796                                 /*
12797                                  * rt_mutex_start_proxy_lock() detected a
12798 @@ -2008,20 +2108,7 @@ queue_unlock(struct futex_hash_bucket *hb)
12799         hb_waiters_dec(hb);
12802 -/**
12803 - * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12804 - * @q: The futex_q to enqueue
12805 - * @hb:        The destination hash bucket
12806 - *
12807 - * The hb->lock must be held by the caller, and is released here. A call to
12808 - * queue_me() is typically paired with exactly one call to unqueue_me().  The
12809 - * exceptions involve the PI related operations, which may use unqueue_me_pi()
12810 - * or nothing if the unqueue is done as part of the wake process and the unqueue
12811 - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12812 - * an example).
12813 - */
12814 -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12815 -       __releases(&hb->lock)
12816 +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12818         int prio;
12820 @@ -2038,6 +2125,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12821         plist_node_init(&q->list, prio);
12822         plist_add(&q->list, &hb->chain);
12823         q->task = current;
12826 +/**
12827 + * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12828 + * @q: The futex_q to enqueue
12829 + * @hb:        The destination hash bucket
12830 + *
12831 + * The hb->lock must be held by the caller, and is released here. A call to
12832 + * queue_me() is typically paired with exactly one call to unqueue_me().  The
12833 + * exceptions involve the PI related operations, which may use unqueue_me_pi()
12834 + * or nothing if the unqueue is done as part of the wake process and the unqueue
12835 + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12836 + * an example).
12837 + */
12838 +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12839 +       __releases(&hb->lock)
12841 +       __queue_me(q, hb);
12842         spin_unlock(&hb->lock);
12845 @@ -2124,10 +2229,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12847         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
12848         struct futex_pi_state *pi_state = q->pi_state;
12849 -       struct task_struct *oldowner = pi_state->owner;
12850         u32 uval, uninitialized_var(curval), newval;
12851 +       struct task_struct *oldowner;
12852         int ret;
12854 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12856 +       oldowner = pi_state->owner;
12857         /* Owner died? */
12858         if (!pi_state->owner)
12859                 newtid |= FUTEX_OWNER_DIED;
12860 @@ -2135,7 +2243,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12861         /*
12862          * We are here either because we stole the rtmutex from the
12863          * previous highest priority waiter or we are the highest priority
12864 -        * waiter but failed to get the rtmutex the first time.
12865 +        * waiter but have failed to get the rtmutex the first time.
12866 +        *
12867          * We have to replace the newowner TID in the user space variable.
12868          * This must be atomic as we have to preserve the owner died bit here.
12869          *
12870 @@ -2143,17 +2252,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12871          * because we can fault here. Imagine swapped out pages or a fork
12872          * that marked all the anonymous memory readonly for cow.
12873          *
12874 -        * Modifying pi_state _before_ the user space value would
12875 -        * leave the pi_state in an inconsistent state when we fault
12876 -        * here, because we need to drop the hash bucket lock to
12877 -        * handle the fault. This might be observed in the PID check
12878 -        * in lookup_pi_state.
12879 +        * Modifying pi_state _before_ the user space value would leave the
12880 +        * pi_state in an inconsistent state when we fault here, because we
12881 +        * need to drop the locks to handle the fault. This might be observed
12882 +        * in the PID check in lookup_pi_state.
12883          */
12884  retry:
12885         if (get_futex_value_locked(&uval, uaddr))
12886                 goto handle_fault;
12888 -       while (1) {
12889 +       for (;;) {
12890                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
12892                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
12893 @@ -2168,47 +2276,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12894          * itself.
12895          */
12896         if (pi_state->owner != NULL) {
12897 -               raw_spin_lock_irq(&pi_state->owner->pi_lock);
12898 +               raw_spin_lock(&pi_state->owner->pi_lock);
12899                 WARN_ON(list_empty(&pi_state->list));
12900                 list_del_init(&pi_state->list);
12901 -               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
12902 +               raw_spin_unlock(&pi_state->owner->pi_lock);
12903         }
12905         pi_state->owner = newowner;
12907 -       raw_spin_lock_irq(&newowner->pi_lock);
12908 +       raw_spin_lock(&newowner->pi_lock);
12909         WARN_ON(!list_empty(&pi_state->list));
12910         list_add(&pi_state->list, &newowner->pi_state_list);
12911 -       raw_spin_unlock_irq(&newowner->pi_lock);
12912 +       raw_spin_unlock(&newowner->pi_lock);
12913 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12915         return 0;
12917         /*
12918 -        * To handle the page fault we need to drop the hash bucket
12919 -        * lock here. That gives the other task (either the highest priority
12920 -        * waiter itself or the task which stole the rtmutex) the
12921 -        * chance to try the fixup of the pi_state. So once we are
12922 -        * back from handling the fault we need to check the pi_state
12923 -        * after reacquiring the hash bucket lock and before trying to
12924 -        * do another fixup. When the fixup has been done already we
12925 -        * simply return.
12926 +        * To handle the page fault we need to drop the locks here. That gives
12927 +        * the other task (either the highest priority waiter itself or the
12928 +        * task which stole the rtmutex) the chance to try the fixup of the
12929 +        * pi_state. So once we are back from handling the fault we need to
12930 +        * check the pi_state after reacquiring the locks and before trying to
12931 +        * do another fixup. When the fixup has been done already we simply
12932 +        * return.
12933 +        *
12934 +        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
12935 +        * drop hb->lock since the caller owns the hb -> futex_q relation.
12936 +        * Dropping the pi_mutex->wait_lock requires revalidating the state.
12937          */
12938  handle_fault:
12939 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12940         spin_unlock(q->lock_ptr);
12942         ret = fault_in_user_writeable(uaddr);
12944         spin_lock(q->lock_ptr);
12945 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12947         /*
12948          * Check if someone else fixed it for us:
12949          */
12950 -       if (pi_state->owner != oldowner)
12951 -               return 0;
12952 +       if (pi_state->owner != oldowner) {
12953 +               ret = 0;
12954 +               goto out_unlock;
12955 +       }
12957         if (ret)
12958 -               return ret;
12959 +               goto out_unlock;
12961         goto retry;
12963 +out_unlock:
12964 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12965 +       return ret;
12968  static long futex_wait_restart(struct restart_block *restart);
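
The fault handling above follows a general pattern: drop the locks, resolve the fault, reacquire the locks in the same order, and revalidate the protected state before retrying, since another task may have completed the fixup while the locks were dropped. A minimal user-space analogue of that drop/revalidate loop, built on a pthread mutex with illustrative stand-in helpers (try_update_user_value() and handle_fault() are hypothetical, not kernel interfaces):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int owner_tid;			/* protected by state_lock */

/* Hypothetical stand-ins for the fallible user-space access. */
static bool try_update_user_value(int newtid) { return newtid != 0; }
static void handle_fault(void) { /* e.g. fault the page back in */ }

static int fixup_owner_demo(int oldtid, int newtid)
{
	pthread_mutex_lock(&state_lock);
retry:
	if (owner_tid != oldtid) {
		/* Someone else fixed it up while we were unlocked. */
		pthread_mutex_unlock(&state_lock);
		return 0;
	}
	if (!try_update_user_value(newtid)) {
		/* Drop the lock to handle the fault ... */
		pthread_mutex_unlock(&state_lock);
		handle_fault();
		/* ... then reacquire and revalidate before retrying. */
		pthread_mutex_lock(&state_lock);
		goto retry;
	}
	owner_tid = newtid;
	pthread_mutex_unlock(&state_lock);
	return 0;
}

int main(void)
{
	owner_tid = 1;
	fixup_owner_demo(1, 2);
	printf("owner is now %d\n", owner_tid);
	return 0;
}
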
12969 @@ -2230,57 +2351,32 @@ static long futex_wait_restart(struct restart_block *restart);
12970   */
12971  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12973 -       struct task_struct *owner;
12974         int ret = 0;
12976         if (locked) {
12977                 /*
12978                  * Got the lock. We might not be the anticipated owner if we
12979                  * did a lock-steal - fix up the PI-state in that case:
12980 +                *
12981 +                * We can safely read pi_state->owner without holding wait_lock
12982 +                * because we now own the rt_mutex, only the owner will attempt
12983 +                * to change it.
12984                  */
12985                 if (q->pi_state->owner != current)
12986                         ret = fixup_pi_state_owner(uaddr, q, current);
12987                 goto out;
12988         }
12990 -       /*
12991 -        * Catch the rare case, where the lock was released when we were on the
12992 -        * way back before we locked the hash bucket.
12993 -        */
12994 -       if (q->pi_state->owner == current) {
12995 -               /*
12996 -                * Try to get the rt_mutex now. This might fail as some other
12997 -                * task acquired the rt_mutex after we removed ourself from the
12998 -                * rt_mutex waiters list.
12999 -                */
13000 -               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
13001 -                       locked = 1;
13002 -                       goto out;
13003 -               }
13005 -               /*
13006 -                * pi_state is incorrect, some other task did a lock steal and
13007 -                * we returned due to timeout or signal without taking the
13008 -                * rt_mutex. Too late.
13009 -                */
13010 -               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
13011 -               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
13012 -               if (!owner)
13013 -                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
13014 -               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
13015 -               ret = fixup_pi_state_owner(uaddr, q, owner);
13016 -               goto out;
13017 -       }
13019         /*
13020          * Paranoia check. If we did not take the lock, then we should not be
13021          * the owner of the rt_mutex.
13022          */
13023 -       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
13024 +       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
13025                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
13026                                 "pi-state %p\n", ret,
13027                                 q->pi_state->pi_mutex.owner,
13028                                 q->pi_state->owner);
13029 +       }
13031  out:
13032         return ret ? ret : locked;
13033 @@ -2504,6 +2600,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13034                          ktime_t *time, int trylock)
13036         struct hrtimer_sleeper timeout, *to = NULL;
13037 +       struct futex_pi_state *pi_state = NULL;
13038 +       struct rt_mutex_waiter rt_waiter;
13039         struct futex_hash_bucket *hb;
13040         struct futex_q q = futex_q_init;
13041         int res, ret;
13042 @@ -2556,24 +2654,76 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13043                 }
13044         }
13046 +       WARN_ON(!q.pi_state);
13048         /*
13049          * Only actually queue now that the atomic ops are done:
13050          */
13051 -       queue_me(&q, hb);
13052 +       __queue_me(&q, hb);
13054 -       WARN_ON(!q.pi_state);
13055 -       /*
13056 -        * Block on the PI mutex:
13057 -        */
13058 -       if (!trylock) {
13059 -               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
13060 -       } else {
13061 -               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
13062 +       if (trylock) {
13063 +               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
13064                 /* Fixup the trylock return value: */
13065                 ret = ret ? 0 : -EWOULDBLOCK;
13066 +               goto no_block;
13067 +       }
13069 +       rt_mutex_init_waiter(&rt_waiter, false);
13071 +       /*
13072 +        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
13073 +        * hold it while doing rt_mutex_start_proxy(), because then it will
13074 +        * include hb->lock in the blocking chain, even though we'll not in
13075 +        * fact hold it while blocking. This will lead it to report -EDEADLK
13076 +        * and BUG when futex_unlock_pi() interleaves with this.
13077 +        *
13078 +        * Therefore acquire wait_lock while holding hb->lock, but drop the
13079 +        * latter before calling rt_mutex_start_proxy_lock(). This still fully
13080 +        * serializes against futex_unlock_pi() as that does the exact same
13081 +        * lock handoff sequence.
13082 +        */
13083 +       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
13084 +       /*
13085 +        * The migrate_disable() here disables migration in the in_atomic() fast
13086 +        * path; migration is enabled again in the following spin_unlock(). We
13087 +        * have one migrate_disable() pending in the slow path, which is reversed
13088 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
13089 +        */
13090 +       migrate_disable();
13092 +       spin_unlock(q.lock_ptr);
13093 +       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
13094 +       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
13095 +       migrate_enable();
13097 +       if (ret) {
13098 +               if (ret == 1)
13099 +                       ret = 0;
13101 +               spin_lock(q.lock_ptr);
13102 +               goto no_block;
13103         }
13106 +       if (unlikely(to))
13107 +               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
13109 +       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
13111         spin_lock(q.lock_ptr);
13112 +       /*
13113 +        * If we failed to acquire the lock (signal/timeout), we must
13114 +        * first acquire the hb->lock before removing the lock from the
13115 +        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
13116 +        * wait lists consistent.
13117 +        *
13118 +        * In particular, it is important that futex_unlock_pi() cannot
13119 +        * observe this inconsistency.
13120 +        */
13121 +       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
13122 +               ret = 0;
13124 +no_block:
13125         /*
13126          * Fixup the pi_state owner and possibly acquire the lock if we
13127          * haven't already.
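
The rework above is the kernel half of the FUTEX_LOCK_PI protocol. For orientation, a rough user-space sketch of the other half, assuming the documented futex(2) PI semantics: uncontended transitions are done with compare-and-swap on the futex word and the kernel is entered only on contention (real implementations such as glibc's PI mutexes additionally handle EINTR, FUTEX_OWNER_DIED and robust lists):

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex_word;	/* 0 == unlocked, otherwise owner TID */

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(void)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Fast path: CAS 0 -> TID takes the lock without entering the kernel. */
	if (__sync_bool_compare_and_swap(&futex_word, 0, tid))
		return;
	/* Contended: block on the kernel-side PI rt_mutex (futex_lock_pi()). */
	futex(&futex_word, FUTEX_LOCK_PI, 0);
}

static void pi_unlock(void)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Fast path: CAS TID -> 0 only succeeds while no waiters are queued. */
	if (__sync_bool_compare_and_swap(&futex_word, tid, 0))
		return;
	/* FUTEX_WAITERS is set: let futex_unlock_pi() hand the lock over. */
	futex(&futex_word, FUTEX_UNLOCK_PI, 0);
}

int main(void)
{
	pi_lock();
	/* critical section */
	pi_unlock();
	return 0;
}
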
13128 @@ -2590,12 +2740,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13129          * If fixup_owner() faulted and was unable to handle the fault, unlock
13130          * it and return the fault to userspace.
13131          */
13132 -       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
13133 -               rt_mutex_unlock(&q.pi_state->pi_mutex);
13134 +       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
13135 +               pi_state = q.pi_state;
13136 +               get_pi_state(pi_state);
13137 +       }
13139         /* Unqueue and drop the lock */
13140         unqueue_me_pi(&q);
13142 +       if (pi_state) {
13143 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
13144 +               put_pi_state(pi_state);
13145 +       }
13147         goto out_put_key;
13149  out_unlock_put_key:
13150 @@ -2604,8 +2761,10 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13151  out_put_key:
13152         put_futex_key(&q.key);
13153  out:
13154 -       if (to)
13155 +       if (to) {
13156 +               hrtimer_cancel(&to->timer);
13157                 destroy_hrtimer_on_stack(&to->timer);
13158 +       }
13159         return ret != -EINTR ? ret : -ERESTARTNOINTR;
13161  uaddr_faulted:
13162 @@ -2632,7 +2791,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13163         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
13164         union futex_key key = FUTEX_KEY_INIT;
13165         struct futex_hash_bucket *hb;
13166 -       struct futex_q *match;
13167 +       struct futex_q *top_waiter;
13168         int ret;
13170  retry:
13171 @@ -2656,12 +2815,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13172          * all and we at least want to know if user space fiddled
13173          * with the futex value instead of blindly unlocking.
13174          */
13175 -       match = futex_top_waiter(hb, &key);
13176 -       if (match) {
13177 -               ret = wake_futex_pi(uaddr, uval, match, hb);
13178 +       top_waiter = futex_top_waiter(hb, &key);
13179 +       if (top_waiter) {
13180 +               struct futex_pi_state *pi_state = top_waiter->pi_state;
13182 +               ret = -EINVAL;
13183 +               if (!pi_state)
13184 +                       goto out_unlock;
13186 +               /*
13187 +                * If current does not own the pi_state then the futex is
13188 +                * inconsistent and user space fiddled with the futex value.
13189 +                */
13190 +               if (pi_state->owner != current)
13191 +                       goto out_unlock;
13193 +               get_pi_state(pi_state);
13194 +               /*
13195 +                * By taking wait_lock while still holding hb->lock, we ensure
13196 +                * there is no point where we hold neither; and therefore
13197 +                * wake_futex_pi() must observe a state consistent with what we
13198 +                * observed.
13199 +                */
13200 +               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
13201 +               /*
13202 +                * Magic trickery for now to make the RT migrate disable
13203 +                * logic happy. The following spin_unlock() happens with
13204 +                * interrupts disabled so the internal migrate_enable()
13205 +                * won't undo the migrate_disable() which was issued when
13206 +                * locking hb->lock.
13207 +                */
13208 +               migrate_disable();
13209 +               spin_unlock(&hb->lock);
13211 +               /* Drops pi_state->pi_mutex.wait_lock */
13212 +               ret = wake_futex_pi(uaddr, uval, pi_state);
13214 +               migrate_enable();
13216 +               put_pi_state(pi_state);
13218                 /*
13219 -                * In case of success wake_futex_pi dropped the hash
13220 -                * bucket lock.
13221 +                * Success, we're done! No tricky corner cases.
13222                  */
13223                 if (!ret)
13224                         goto out_putkey;
13225 @@ -2676,7 +2871,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13226                  * setting the FUTEX_WAITERS bit. Try again.
13227                  */
13228                 if (ret == -EAGAIN) {
13229 -                       spin_unlock(&hb->lock);
13230                         put_futex_key(&key);
13231                         goto retry;
13232                 }
13233 @@ -2684,7 +2878,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13234                  * wake_futex_pi has detected invalid state. Tell user
13235                  * space.
13236                  */
13237 -               goto out_unlock;
13238 +               goto out_putkey;
13239         }
13241         /*
13242 @@ -2694,8 +2888,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13243          * preserve the WAITERS bit not the OWNER_DIED one. We are the
13244          * owner.
13245          */
13246 -       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
13247 +       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
13248 +               spin_unlock(&hb->lock);
13249                 goto pi_faulted;
13250 +       }
13252         /*
13253          * If uval has changed, let user space handle it.
13254 @@ -2709,7 +2905,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13255         return ret;
13257  pi_faulted:
13258 -       spin_unlock(&hb->lock);
13259         put_futex_key(&key);
13261         ret = fault_in_user_writeable(uaddr);
13262 @@ -2813,8 +3008,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13263                                  u32 __user *uaddr2)
13265         struct hrtimer_sleeper timeout, *to = NULL;
13266 +       struct futex_pi_state *pi_state = NULL;
13267         struct rt_mutex_waiter rt_waiter;
13268 -       struct futex_hash_bucket *hb;
13269 +       struct futex_hash_bucket *hb, *hb2;
13270         union futex_key key2 = FUTEX_KEY_INIT;
13271         struct futex_q q = futex_q_init;
13272         int res, ret;
13273 @@ -2839,10 +3035,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13274          * The waiter is allocated on our stack, manipulated by the requeue
13275          * code while we sleep on uaddr.
13276          */
13277 -       debug_rt_mutex_init_waiter(&rt_waiter);
13278 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
13279 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
13280 -       rt_waiter.task = NULL;
13281 +       rt_mutex_init_waiter(&rt_waiter, false);
13283         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
13284         if (unlikely(ret != 0))
13285 @@ -2873,20 +3066,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13286         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
13287         futex_wait_queue_me(hb, &q, to);
13289 -       spin_lock(&hb->lock);
13290 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
13291 -       spin_unlock(&hb->lock);
13292 -       if (ret)
13293 -               goto out_put_keys;
13294 +       /*
13295 +        * On RT we must avoid races with requeue and trying to block
13296 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
13297 +        * serializing access to pi_blocked_on with pi_lock.
13298 +        */
13299 +       raw_spin_lock_irq(&current->pi_lock);
13300 +       if (current->pi_blocked_on) {
13301 +               /*
13302 +                * We have been requeued or are in the process of
13303 +                * being requeued.
13304 +                */
13305 +               raw_spin_unlock_irq(&current->pi_lock);
13306 +       } else {
13307 +               /*
13308 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
13309 +                * prevents a concurrent requeue from moving us to the
13310 +                * uaddr2 rtmutex. After that we can safely acquire
13311 +                * (and possibly block on) hb->lock.
13312 +                */
13313 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
13314 +               raw_spin_unlock_irq(&current->pi_lock);
13316 +               spin_lock(&hb->lock);
13318 +               /*
13319 +                * Clean up pi_blocked_on. We might leak it otherwise
13320 +                * when we succeeded with the hb->lock in the fast
13321 +                * path.
13322 +                */
13323 +               raw_spin_lock_irq(&current->pi_lock);
13324 +               current->pi_blocked_on = NULL;
13325 +               raw_spin_unlock_irq(&current->pi_lock);
13327 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
13328 +               spin_unlock(&hb->lock);
13329 +               if (ret)
13330 +                       goto out_put_keys;
13331 +       }
13333         /*
13334 -        * In order for us to be here, we know our q.key == key2, and since
13335 -        * we took the hb->lock above, we also know that futex_requeue() has
13336 -        * completed and we no longer have to concern ourselves with a wakeup
13337 -        * race with the atomic proxy lock acquisition by the requeue code. The
13338 -        * futex_requeue dropped our key1 reference and incremented our key2
13339 -        * reference count.
13340 +        * In order to be here, we have either been requeued, are in
13341 +        * the process of being requeued, or requeue successfully
13342 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
13343 +        * non-null above, we may be racing with a requeue.  Do not
13344 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
13345 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
13346 +        * reference and incremented our key2 reference count.
13347          */
13348 +       hb2 = hash_futex(&key2);
13350         /* Check if the requeue code acquired the second futex for us. */
13351         if (!q.rt_waiter) {
13352 @@ -2895,16 +3123,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13353                  * did a lock-steal - fix up the PI-state in that case.
13354                  */
13355                 if (q.pi_state && (q.pi_state->owner != current)) {
13356 -                       spin_lock(q.lock_ptr);
13357 +                       spin_lock(&hb2->lock);
13358 +                       BUG_ON(&hb2->lock != q.lock_ptr);
13359                         ret = fixup_pi_state_owner(uaddr2, &q, current);
13360 -                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
13361 -                               rt_mutex_unlock(&q.pi_state->pi_mutex);
13362 +                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
13363 +                               pi_state = q.pi_state;
13364 +                               get_pi_state(pi_state);
13365 +                       }
13366                         /*
13367                          * Drop the reference to the pi state which
13368                          * the requeue_pi() code acquired for us.
13369                          */
13370                         put_pi_state(q.pi_state);
13371 -                       spin_unlock(q.lock_ptr);
13372 +                       spin_unlock(&hb2->lock);
13373                 }
13374         } else {
13375                 struct rt_mutex *pi_mutex;
13376 @@ -2916,10 +3147,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13377                  */
13378                 WARN_ON(!q.pi_state);
13379                 pi_mutex = &q.pi_state->pi_mutex;
13380 -               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
13381 -               debug_rt_mutex_free_waiter(&rt_waiter);
13382 +               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
13384 -               spin_lock(q.lock_ptr);
13385 +               spin_lock(&hb2->lock);
13386 +               BUG_ON(&hb2->lock != q.lock_ptr);
13387 +               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
13388 +                       ret = 0;
13390 +               debug_rt_mutex_free_waiter(&rt_waiter);
13391                 /*
13392                  * Fixup the pi_state owner and possibly acquire the lock if we
13393                  * haven't already.
13394 @@ -2937,13 +3172,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13395                  * the fault, unlock the rt_mutex and return the fault to
13396                  * userspace.
13397                  */
13398 -               if (ret && rt_mutex_owner(pi_mutex) == current)
13399 -                       rt_mutex_unlock(pi_mutex);
13400 +               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
13401 +                       pi_state = q.pi_state;
13402 +                       get_pi_state(pi_state);
13403 +               }
13405                 /* Unqueue and drop the lock. */
13406                 unqueue_me_pi(&q);
13407         }
13409 +       if (pi_state) {
13410 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
13411 +               put_pi_state(pi_state);
13412 +       }
13414         if (ret == -EINTR) {
13415                 /*
13416                  * We've already been requeued, but cannot restart by calling
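
The futex_wait_requeue_pi() path reworked above is normally reached from user space through a condition variable paired with a priority-inheriting mutex; glibc typically implements that wait with FUTEX_WAIT_REQUEUE_PI / FUTEX_CMP_REQUEUE_PI rather than a plain wait plus wake. A minimal pthread sketch of the triggering pattern, assuming PTHREAD_PRIO_INHERIT support:

#include <pthread.h>

static pthread_mutex_t m;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int ready;

static void init_pi_mutex(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	/* Priority inheritance: contended waits go through the PI futex ops. */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);
	pthread_mutexattr_destroy(&attr);
}

static void *waiter(void *arg)
{
	pthread_mutex_lock(&m);
	while (!ready)
		/* With a PI mutex, glibc typically maps this wait onto
		 * FUTEX_WAIT_REQUEUE_PI / FUTEX_CMP_REQUEUE_PI. */
		pthread_cond_wait(&cond, &m);
	pthread_mutex_unlock(&m);
	return arg;
}

int main(void)
{
	pthread_t t;

	init_pi_mutex();
	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&m);
	ready = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&m);
	pthread_join(t, NULL);
	return 0;
}
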
13417 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
13418 index d3f24905852c..f87aa8fdcc51 100644
13419 --- a/kernel/irq/handle.c
13420 +++ b/kernel/irq/handle.c
13421 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
13423         irqreturn_t retval;
13424         unsigned int flags = 0;
13425 +       struct pt_regs *regs = get_irq_regs();
13426 +       u64 ip = regs ? instruction_pointer(regs) : 0;
13428         retval = __handle_irq_event_percpu(desc, &flags);
13430 -       add_interrupt_randomness(desc->irq_data.irq, flags);
13431 +#ifdef CONFIG_PREEMPT_RT_FULL
13432 +       desc->random_ip = ip;
13433 +#else
13434 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
13435 +#endif
13437         if (!noirqdebug)
13438                 note_interrupt(desc, retval);
13439 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
13440 index ea41820ab12e..5994867526f3 100644
13441 --- a/kernel/irq/manage.c
13442 +++ b/kernel/irq/manage.c
13443 @@ -22,6 +22,7 @@
13444  #include "internals.h"
13446  #ifdef CONFIG_IRQ_FORCED_THREADING
13447 +# ifndef CONFIG_PREEMPT_RT_BASE
13448  __read_mostly bool force_irqthreads;
13450  static int __init setup_forced_irqthreads(char *arg)
13451 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
13452         return 0;
13454  early_param("threadirqs", setup_forced_irqthreads);
13455 +# endif
13456  #endif
13458  static void __synchronize_hardirq(struct irq_desc *desc)
13459 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
13461         if (desc->affinity_notify) {
13462                 kref_get(&desc->affinity_notify->kref);
13464 +#ifdef CONFIG_PREEMPT_RT_BASE
13465 +               swork_queue(&desc->affinity_notify->swork);
13466 +#else
13467                 schedule_work(&desc->affinity_notify->work);
13468 +#endif
13469         }
13470         irqd_set(data, IRQD_AFFINITY_SET);
13472 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
13474  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
13476 -static void irq_affinity_notify(struct work_struct *work)
13477 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
13479 -       struct irq_affinity_notify *notify =
13480 -               container_of(work, struct irq_affinity_notify, work);
13481         struct irq_desc *desc = irq_to_desc(notify->irq);
13482         cpumask_var_t cpumask;
13483         unsigned long flags;
13484 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
13485         kref_put(&notify->kref, notify->release);
13488 +#ifdef CONFIG_PREEMPT_RT_BASE
13489 +static void init_helper_thread(void)
13491 +       static int init_sworker_once;
13493 +       if (init_sworker_once)
13494 +               return;
13495 +       if (WARN_ON(swork_get()))
13496 +               return;
13497 +       init_sworker_once = 1;
13500 +static void irq_affinity_notify(struct swork_event *swork)
13502 +       struct irq_affinity_notify *notify =
13503 +               container_of(swork, struct irq_affinity_notify, swork);
13504 +       _irq_affinity_notify(notify);
13507 +#else
13509 +static void irq_affinity_notify(struct work_struct *work)
13511 +       struct irq_affinity_notify *notify =
13512 +               container_of(work, struct irq_affinity_notify, work);
13513 +       _irq_affinity_notify(notify);
13515 +#endif
13517  /**
13518   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
13519   *     @irq:           Interrupt for which to enable/disable notification
13520 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
13521         if (notify) {
13522                 notify->irq = irq;
13523                 kref_init(&notify->kref);
13524 +#ifdef CONFIG_PREEMPT_RT_BASE
13525 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
13526 +               init_helper_thread();
13527 +#else
13528                 INIT_WORK(&notify->work, irq_affinity_notify);
13529 +#endif
13530         }
13532         raw_spin_lock_irqsave(&desc->lock, flags);
13533 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
13534         local_bh_disable();
13535         ret = action->thread_fn(action->irq, action->dev_id);
13536         irq_finalize_oneshot(desc, action);
13537 -       local_bh_enable();
13538 +       /*
13539 +        * Interrupts which have real time requirements can be set up
13540 +        * to avoid softirq processing in the thread handler. This is
13541 +        * safe as these interrupts do not raise soft interrupts.
13542 +        */
13543 +       if (irq_settings_no_softirq_call(desc))
13544 +               _local_bh_enable();
13545 +       else
13546 +               local_bh_enable();
13547         return ret;
13550 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
13551                 if (action_ret == IRQ_WAKE_THREAD)
13552                         irq_wake_secondary(desc, action);
13554 +#ifdef CONFIG_PREEMPT_RT_FULL
13555 +               migrate_disable();
13556 +               add_interrupt_randomness(action->irq, 0,
13557 +                                desc->random_ip ^ (unsigned long) action);
13558 +               migrate_enable();
13559 +#endif
13560                 wake_threads_waitq(desc);
13561         }
13563 @@ -1338,6 +1391,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
13564                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
13565                 }
13567 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
13568 +                       irq_settings_set_no_softirq_call(desc);
13570                 /* Set default affinity mask once everything is setup */
13571                 setup_affinity(desc, mask);
13573 @@ -2063,7 +2119,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
13574   *     This call sets the internal irqchip state of an interrupt,
13575   *     depending on the value of @which.
13576   *
13577 - *     This function should be called with preemption disabled if the
13578 + *     This function should be called with migration disabled if the
13579   *     interrupt controller has per-cpu registers.
13580   */
13581  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
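
The IRQF_NO_SOFTIRQ_CALL handling added above lets a latency-critical interrupt opt its force-threaded handler out of local softirq processing on return. A hypothetical driver would request such an interrupt roughly as follows (the IRQ number, names and handler body are illustrative, not taken from the patch):

#include <linux/interrupt.h>
#include <linux/module.h>

#define DEMO_IRQ	42	/* illustrative IRQ number */

static irqreturn_t demo_handler(int irq, void *dev_id)
{
	/* Latency-sensitive work only; must not rely on raising softirqs. */
	return IRQ_HANDLED;
}

static int __init demo_init(void)
{
	/*
	 * On PREEMPT_RT this handler runs force-threaded; IRQF_NO_SOFTIRQ_CALL
	 * makes the thread skip local softirq processing when it returns
	 * (see the irq_forced_thread_fn() change above).
	 */
	return request_irq(DEMO_IRQ, demo_handler, IRQF_NO_SOFTIRQ_CALL,
			   "demo-rt-irq", NULL);
}

static void __exit demo_exit(void)
{
	free_irq(DEMO_IRQ, NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
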
13582 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
13583 index 320579d89091..2df2d4445b1e 100644
13584 --- a/kernel/irq/settings.h
13585 +++ b/kernel/irq/settings.h
13586 @@ -16,6 +16,7 @@ enum {
13587         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
13588         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
13589         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
13590 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
13591         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
13592  };
13594 @@ -30,6 +31,7 @@ enum {
13595  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
13596  #define IRQ_IS_POLLED          GOT_YOU_MORON
13597  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
13598 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
13599  #undef IRQF_MODIFY_MASK
13600  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
13602 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
13603         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
13606 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
13608 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
13611 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
13613 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
13616  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
13618         return desc->status_use_accessors & _IRQ_PER_CPU;
13619 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
13620 index 5707f97a3e6a..73f38dc7a7fb 100644
13621 --- a/kernel/irq/spurious.c
13622 +++ b/kernel/irq/spurious.c
13623 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
13625  static int __init irqfixup_setup(char *str)
13627 +#ifdef CONFIG_PREEMPT_RT_BASE
13628 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13629 +       return 1;
13630 +#endif
13631         irqfixup = 1;
13632         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
13633         printk(KERN_WARNING "This may impact system performance.\n");
13634 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
13636  static int __init irqpoll_setup(char *str)
13638 +#ifdef CONFIG_PREEMPT_RT_BASE
13639 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13640 +       return 1;
13641 +#endif
13642         irqfixup = 2;
13643         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
13644                                 "enabled\n");
13645 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
13646 index bcf107ce0854..2899ba0d23d1 100644
13647 --- a/kernel/irq_work.c
13648 +++ b/kernel/irq_work.c
13649 @@ -17,6 +17,7 @@
13650  #include <linux/cpu.h>
13651  #include <linux/notifier.h>
13652  #include <linux/smp.h>
13653 +#include <linux/interrupt.h>
13654  #include <asm/processor.h>
13657 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
13658   */
13659  bool irq_work_queue_on(struct irq_work *work, int cpu)
13661 +       struct llist_head *list;
13663         /* All work should have been flushed before going offline */
13664         WARN_ON_ONCE(cpu_is_offline(cpu));
13666 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
13667         if (!irq_work_claim(work))
13668                 return false;
13670 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
13671 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
13672 +               list = &per_cpu(lazy_list, cpu);
13673 +       else
13674 +               list = &per_cpu(raised_list, cpu);
13676 +       if (llist_add(&work->llnode, list))
13677                 arch_send_call_function_single_ipi(cpu);
13679         return true;
13680 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
13681  /* Enqueue the irq work @work on the current CPU */
13682  bool irq_work_queue(struct irq_work *work)
13684 +       struct llist_head *list;
13685 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
13687         /* Only queue if not already pending */
13688         if (!irq_work_claim(work))
13689                 return false;
13690 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
13691         /* Queue the entry and raise the IPI if needed. */
13692         preempt_disable();
13694 -       /* If the work is "lazy", handle it from next tick if any */
13695 -       if (work->flags & IRQ_WORK_LAZY) {
13696 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
13697 -                   tick_nohz_tick_stopped())
13698 -                       arch_irq_work_raise();
13699 -       } else {
13700 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
13701 +       lazy_work = work->flags & IRQ_WORK_LAZY;
13703 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
13704 +               list = this_cpu_ptr(&lazy_list);
13705 +       else
13706 +               list = this_cpu_ptr(&raised_list);
13708 +       if (llist_add(&work->llnode, list)) {
13709 +               if (!lazy_work || tick_nohz_tick_stopped())
13710                         arch_irq_work_raise();
13711         }
13713 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
13714         raised = this_cpu_ptr(&raised_list);
13715         lazy = this_cpu_ptr(&lazy_list);
13717 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
13718 -               if (llist_empty(lazy))
13719 -                       return false;
13720 +       if (llist_empty(raised) && llist_empty(lazy))
13721 +               return false;
13723         /* All work should have been flushed before going offline */
13724         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
13725 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
13726         struct irq_work *work;
13727         struct llist_node *llnode;
13729 -       BUG_ON(!irqs_disabled());
13730 +       BUG_ON_NONRT(!irqs_disabled());
13732         if (llist_empty(list))
13733                 return;
13734 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
13735  void irq_work_run(void)
13737         irq_work_run_list(this_cpu_ptr(&raised_list));
13738 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
13739 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
13740 +               /*
13741 +                * NOTE: we raise softirq via IPI for safety,
13742 +                * and execute in irq_work_tick() to move the
13743 +                * overhead from hard to soft irq context.
13744 +                */
13745 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
13746 +                       raise_softirq(TIMER_SOFTIRQ);
13747 +       } else
13748 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13750  EXPORT_SYMBOL_GPL(irq_work_run);
13752 @@ -179,8 +200,17 @@ void irq_work_tick(void)
13754         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
13755                 irq_work_run_list(raised);
13757 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
13758 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13761 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
13762 +void irq_work_tick_soft(void)
13764         irq_work_run_list(this_cpu_ptr(&lazy_list));
13766 +#endif
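
On PREEMPT_RT_FULL the changes above defer ordinary irq_work items to the lazy list and the timer softirq (irq_work_tick_soft()); only items carrying the RT-specific IRQ_WORK_HARD_IRQ flag stay on the raised list and still execute in hard interrupt context. A hypothetical user of the hard-irq variant (names illustrative):

#include <linux/irq_work.h>
#include <linux/module.h>
#include <linux/printk.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	pr_info("demo irq_work ran\n");
}

/*
 * IRQ_WORK_HARD_IRQ (see irq_work_queue() above) keeps the item on
 * raised_list, so it runs from hard irq context even on PREEMPT_RT_FULL.
 */
static struct irq_work demo_work = {
	.flags	= IRQ_WORK_HARD_IRQ,
	.func	= demo_irq_work_fn,
};

static int __init demo_init(void)
{
	irq_work_queue(&demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	irq_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
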
13768  /*
13769   * Synchronize against the irq_work @entry, ensures the entry is not
13770 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
13771 index ee1bc1bb8feb..ddef07958840 100644
13772 --- a/kernel/ksysfs.c
13773 +++ b/kernel/ksysfs.c
13774 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
13776  #endif /* CONFIG_KEXEC_CORE */
13778 +#if defined(CONFIG_PREEMPT_RT_FULL)
13779 +static ssize_t  realtime_show(struct kobject *kobj,
13780 +                             struct kobj_attribute *attr, char *buf)
13782 +       return sprintf(buf, "%d\n", 1);
13784 +KERNEL_ATTR_RO(realtime);
13785 +#endif
13787  /* whether file capabilities are enabled */
13788  static ssize_t fscaps_show(struct kobject *kobj,
13789                                   struct kobj_attribute *attr, char *buf)
13790 @@ -224,6 +233,9 @@ static struct attribute * kernel_attrs[] = {
13791  #ifndef CONFIG_TINY_RCU
13792         &rcu_expedited_attr.attr,
13793         &rcu_normal_attr.attr,
13794 +#endif
13795 +#ifdef CONFIG_PREEMPT_RT_FULL
13796 +       &realtime_attr.attr,
13797  #endif
13798         NULL
13799  };
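
The new attribute gives user space a cheap probe for an RT kernel: /sys/kernel/realtime exists and reads as 1 only when CONFIG_PREEMPT_RT_FULL is set. A small sketch of checking it from an application:

#include <stdio.h>

/* Returns 1 on a PREEMPT_RT_FULL kernel, 0 otherwise. */
static int kernel_is_realtime(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int val = 0;

	if (!f)
		return 0;	/* attribute absent: not an RT_FULL kernel */
	if (fscanf(f, "%d", &val) != 1)
		val = 0;
	fclose(f);
	return val == 1;
}

int main(void)
{
	printf("realtime kernel: %s\n", kernel_is_realtime() ? "yes" : "no");
	return 0;
}
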
13800 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
13801 index 6f88e352cd4f..6ff9e8011dd0 100644
13802 --- a/kernel/locking/Makefile
13803 +++ b/kernel/locking/Makefile
13804 @@ -2,7 +2,7 @@
13805  # and is generally not a function of system call inputs.
13806  KCOV_INSTRUMENT                := n
13808 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
13809 +obj-y += semaphore.o percpu-rwsem.o
13811  ifdef CONFIG_FUNCTION_TRACER
13812  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
13813 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
13814  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
13815  endif
13817 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13818 +obj-y += mutex.o
13819  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
13820 +endif
13821 +obj-y += rwsem.o
13822  obj-$(CONFIG_LOCKDEP) += lockdep.o
13823  ifeq ($(CONFIG_PROC_FS),y)
13824  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
13825 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
13826  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
13827  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
13828  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
13829 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13830  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
13831  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
13832 +endif
13833 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o rwsem-rt.o
13834  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
13835  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
13836 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
13837 index 6599c7f3071d..79f8e00e802e 100644
13838 --- a/kernel/locking/lockdep.c
13839 +++ b/kernel/locking/lockdep.c
13840 @@ -658,6 +658,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13841         struct lockdep_subclass_key *key;
13842         struct hlist_head *hash_head;
13843         struct lock_class *class;
13844 +       bool is_static = false;
13846         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
13847                 debug_locks_off();
13848 @@ -671,10 +672,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13850         /*
13851          * Static locks do not have their class-keys yet - for them the key
13852 -        * is the lock object itself:
13853 +        * is the lock object itself. If the lock is in the per cpu area,
13854 +        * the canonical address of the lock (per cpu offset removed) is
13855 +        * used.
13856          */
13857 -       if (unlikely(!lock->key))
13858 -               lock->key = (void *)lock;
13859 +       if (unlikely(!lock->key)) {
13860 +               unsigned long can_addr, addr = (unsigned long)lock;
13862 +               if (__is_kernel_percpu_address(addr, &can_addr))
13863 +                       lock->key = (void *)can_addr;
13864 +               else if (__is_module_percpu_address(addr, &can_addr))
13865 +                       lock->key = (void *)can_addr;
13866 +               else if (static_obj(lock))
13867 +                       lock->key = (void *)lock;
13868 +               else
13869 +                       return ERR_PTR(-EINVAL);
13870 +               is_static = true;
13871 +       }
13873         /*
13874          * NOTE: the class-key must be unique. For dynamic locks, a static
13875 @@ -706,7 +720,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13876                 }
13877         }
13879 -       return NULL;
13880 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
13883  /*
13884 @@ -724,19 +738,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
13885         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
13887         class = look_up_lock_class(lock, subclass);
13888 -       if (likely(class))
13889 +       if (likely(!IS_ERR_OR_NULL(class)))
13890                 goto out_set_class_cache;
13892         /*
13893          * Debug-check: all keys must be persistent!
13894 -        */
13895 -       if (!static_obj(lock->key)) {
13896 +        */
13897 +       if (IS_ERR(class)) {
13898                 debug_locks_off();
13899                 printk("INFO: trying to register non-static key.\n");
13900                 printk("the code is fine but needs lockdep annotation.\n");
13901                 printk("turning off the locking correctness validator.\n");
13902                 dump_stack();
13904                 return NULL;
13905         }
13907 @@ -3417,7 +3430,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
13908                  * Clearly if the lock hasn't been acquired _ever_, we're not
13909                  * holding it either, so report failure.
13910                  */
13911 -               if (!class)
13912 +               if (IS_ERR_OR_NULL(class))
13913                         return 0;
13915                 /*
13916 @@ -3696,6 +3709,7 @@ static void check_flags(unsigned long flags)
13917                 }
13918         }
13920 +#ifndef CONFIG_PREEMPT_RT_FULL
13921         /*
13922          * We dont accurately track softirq state in e.g.
13923          * hardirq contexts (such as on 4KSTACKS), so only
13924 @@ -3710,6 +3724,7 @@ static void check_flags(unsigned long flags)
13925                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13926                 }
13927         }
13928 +#endif
13930         if (!debug_locks)
13931                 print_irqtrace_events(current);
13932 @@ -4166,7 +4181,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
13933                  * If the class exists we look it up and zap it:
13934                  */
13935                 class = look_up_lock_class(lock, j);
13936 -               if (class)
13937 +               if (!IS_ERR_OR_NULL(class))
13938                         zap_class(class);
13939         }
13940         /*
13941 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
13942 index d3de04b12f8c..0f49abeae337 100644
13943 --- a/kernel/locking/locktorture.c
13944 +++ b/kernel/locking/locktorture.c
13945 @@ -26,7 +26,6 @@
13946  #include <linux/kthread.h>
13947  #include <linux/sched/rt.h>
13948  #include <linux/spinlock.h>
13949 -#include <linux/rwlock.h>
13950  #include <linux/mutex.h>
13951  #include <linux/rwsem.h>
13952  #include <linux/smp.h>
13953 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
13954 index ce182599cf2e..2ad3a1e8344c 100644
13955 --- a/kernel/locking/percpu-rwsem.c
13956 +++ b/kernel/locking/percpu-rwsem.c
13957 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
13958         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
13959         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
13960         __init_rwsem(&sem->rw_sem, name, rwsem_key);
13961 -       init_waitqueue_head(&sem->writer);
13962 +       init_swait_queue_head(&sem->writer);
13963         sem->readers_block = 0;
13964         return 0;
13966 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
13967         __this_cpu_dec(*sem->read_count);
13969         /* Prod writer to recheck readers_active */
13970 -       wake_up(&sem->writer);
13971 +       swake_up(&sem->writer);
13973  EXPORT_SYMBOL_GPL(__percpu_up_read);
13975 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
13976          */
13978         /* Wait for all now active readers to complete. */
13979 -       wait_event(sem->writer, readers_active_check(sem));
13980 +       swait_event(sem->writer, readers_active_check(sem));
13982  EXPORT_SYMBOL_GPL(percpu_down_write);
13984 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
13985 new file mode 100644
13986 index 000000000000..6284e3b15091
13987 --- /dev/null
13988 +++ b/kernel/locking/rt.c
13989 @@ -0,0 +1,331 @@
13991 + * kernel/rt.c
13992 + *
13993 + * Real-Time Preemption Support
13994 + *
13995 + * started by Ingo Molnar:
13996 + *
13997 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
13998 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13999 + *
14000 + * historic credit for proving that Linux spinlocks can be implemented via
14001 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
14002 + * and others) who prototyped it on 2.4 and did lots of comparative
14003 + * research and analysis; TimeSys, for proving that you can implement a
14004 + * fully preemptible kernel via the use of IRQ threading and mutexes;
14005 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
14006 + * right one; and to MontaVista, who ported pmutexes to 2.6.
14007 + *
14008 + * This code is a from-scratch implementation and is not based on pmutexes,
14009 + * but the idea of converting spinlocks to mutexes is used here too.
14010 + *
14011 + * lock debugging, locking tree, deadlock detection:
14012 + *
14013 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
14014 + *  Released under the General Public License (GPL).
14015 + *
14016 + * Includes portions of the generic R/W semaphore implementation from:
14017 + *
14018 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
14019 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
14020 + *  - Derived also from comments by Linus
14021 + *
14022 + * Pending ownership of locks and ownership stealing:
14023 + *
14024 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
14025 + *
14026 + *   (also by Steven Rostedt)
14027 + *    - Converted single pi_lock to individual task locks.
14028 + *
14029 + * By Esben Nielsen:
14030 + *    Doing priority inheritance with help of the scheduler.
14031 + *
14032 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14033 + *  - major rework based on Esben Nielsens initial patch
14034 + *  - replaced thread_info references by task_struct refs
14035 + *  - removed task->pending_owner dependency
14036 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
14037 + *    in the scheduler return path as discussed with Steven Rostedt
14038 + *
14039 + *  Copyright (C) 2006, Kihon Technologies Inc.
14040 + *    Steven Rostedt <rostedt@goodmis.org>
14041 + *  - debugged and patched Thomas Gleixner's rework.
14042 + *  - added back the cmpxchg to the rework.
14043 + *  - turned atomic require back on for SMP.
14044 + */
14046 +#include <linux/spinlock.h>
14047 +#include <linux/rtmutex.h>
14048 +#include <linux/sched.h>
14049 +#include <linux/delay.h>
14050 +#include <linux/module.h>
14051 +#include <linux/kallsyms.h>
14052 +#include <linux/syscalls.h>
14053 +#include <linux/interrupt.h>
14054 +#include <linux/plist.h>
14055 +#include <linux/fs.h>
14056 +#include <linux/futex.h>
14057 +#include <linux/hrtimer.h>
14059 +#include "rtmutex_common.h"
14062 + * struct mutex functions
14063 + */
14064 +void __mutex_do_init(struct mutex *mutex, const char *name,
14065 +                    struct lock_class_key *key)
14067 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14068 +       /*
14069 +        * Make sure we are not reinitializing a held lock:
14070 +        */
14071 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
14072 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
14073 +#endif
14074 +       mutex->lock.save_state = 0;
14076 +EXPORT_SYMBOL(__mutex_do_init);
14078 +void __lockfunc _mutex_lock(struct mutex *lock)
14080 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14081 +       rt_mutex_lock(&lock->lock);
14083 +EXPORT_SYMBOL(_mutex_lock);
14085 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
14087 +       int ret;
14089 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14090 +       ret = rt_mutex_lock_interruptible(&lock->lock);
14091 +       if (ret)
14092 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14093 +       return ret;
14095 +EXPORT_SYMBOL(_mutex_lock_interruptible);
14097 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
14099 +       int ret;
14101 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14102 +       ret = rt_mutex_lock_killable(&lock->lock);
14103 +       if (ret)
14104 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14105 +       return ret;
14107 +EXPORT_SYMBOL(_mutex_lock_killable);
14109 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14110 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
14112 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
14113 +       rt_mutex_lock(&lock->lock);
14115 +EXPORT_SYMBOL(_mutex_lock_nested);
14117 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
14119 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
14120 +       rt_mutex_lock(&lock->lock);
14122 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
14124 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
14126 +       int ret;
14128 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
14129 +       ret = rt_mutex_lock_interruptible(&lock->lock);
14130 +       if (ret)
14131 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14132 +       return ret;
14134 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
14136 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
14138 +       int ret;
14140 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
14141 +       ret = rt_mutex_lock_killable(&lock->lock);
14142 +       if (ret)
14143 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14144 +       return ret;
14146 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
14147 +#endif
14149 +int __lockfunc _mutex_trylock(struct mutex *lock)
14151 +       int ret = rt_mutex_trylock(&lock->lock);
14153 +       if (ret)
14154 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14156 +       return ret;
14158 +EXPORT_SYMBOL(_mutex_trylock);
14160 +void __lockfunc _mutex_unlock(struct mutex *lock)
14162 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
14163 +       rt_mutex_unlock(&lock->lock);
14165 +EXPORT_SYMBOL(_mutex_unlock);
14168 + * rwlock_t functions
14169 + */
14170 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
14172 +       int ret;
14174 +       migrate_disable();
14175 +       ret = rt_mutex_trylock(&rwlock->lock);
14176 +       if (ret)
14177 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
14178 +       else
14179 +               migrate_enable();
14181 +       return ret;
14183 +EXPORT_SYMBOL(rt_write_trylock);
14185 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
14187 +       int ret;
14189 +       *flags = 0;
14190 +       ret = rt_write_trylock(rwlock);
14191 +       return ret;
14193 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
14195 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
14197 +       struct rt_mutex *lock = &rwlock->lock;
14198 +       int ret = 1;
14200 +       /*
14201 +        * recursive read locks succeed when current owns the lock,
14202 +        * but not when read_depth == 0 which means that the lock is
14203 +        * write locked.
14204 +        */
14205 +       if (rt_mutex_owner(lock) != current) {
14206 +               migrate_disable();
14207 +               ret = rt_mutex_trylock(lock);
14208 +               if (ret)
14209 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
14210 +               else
14211 +                       migrate_enable();
14213 +       } else if (!rwlock->read_depth) {
14214 +               ret = 0;
14215 +       }
14217 +       if (ret)
14218 +               rwlock->read_depth++;
14220 +       return ret;
14222 +EXPORT_SYMBOL(rt_read_trylock);
14224 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
14226 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
14227 +       __rt_spin_lock(&rwlock->lock);
14229 +EXPORT_SYMBOL(rt_write_lock);
14231 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
14233 +       struct rt_mutex *lock = &rwlock->lock;
14236 +       /*
14237 +        * recursive read locks succeed when current owns the lock
14238 +        */
14239 +       if (rt_mutex_owner(lock) != current) {
14240 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
14241 +               __rt_spin_lock(lock);
14242 +       }
14243 +       rwlock->read_depth++;
14246 +EXPORT_SYMBOL(rt_read_lock);
14248 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
14250 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14251 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
14252 +       __rt_spin_unlock(&rwlock->lock);
14253 +       migrate_enable();
14255 +EXPORT_SYMBOL(rt_write_unlock);
14257 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
14259 +       /* Release the lock only when read_depth is down to 0 */
14260 +       if (--rwlock->read_depth == 0) {
14261 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
14262 +               __rt_spin_unlock(&rwlock->lock);
14263 +               migrate_enable();
14264 +       }
14266 +EXPORT_SYMBOL(rt_read_unlock);
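
On PREEMPT_RT_FULL an rwlock_t is thus backed by an rt_mutex plus a per-owner read_depth, so the owning task may re-take a read lock it already holds, while writers and non-owning readers block as before. A short sketch of the nesting the read_depth counter preserves (module-style, names illustrative):

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_rwlock);
static int demo_value;

static int demo_read_nested(void)
{
	int v;

	read_lock(&demo_rwlock);
	/*
	 * A nested read_lock by the same task just increments read_depth
	 * on PREEMPT_RT_FULL; recursive read locking is also permitted by
	 * the mainline rwlock_t implementation.
	 */
	read_lock(&demo_rwlock);
	v = demo_value;
	read_unlock(&demo_rwlock);
	read_unlock(&demo_rwlock);
	return v;
}

static int __init demo_init(void)
{
	write_lock(&demo_rwlock);
	demo_value = 42;
	write_unlock(&demo_rwlock);
	return demo_read_nested() == 42 ? 0 : -EINVAL;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
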
14268 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
14270 +       rt_write_lock(rwlock);
14272 +       return 0;
14274 +EXPORT_SYMBOL(rt_write_lock_irqsave);
14276 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
14278 +       rt_read_lock(rwlock);
14280 +       return 0;
14282 +EXPORT_SYMBOL(rt_read_lock_irqsave);
14284 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
14286 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14287 +       /*
14288 +        * Make sure we are not reinitializing a held lock:
14289 +        */
14290 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
14291 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
14292 +#endif
14293 +       rwlock->lock.save_state = 1;
14294 +       rwlock->read_depth = 0;
14296 +EXPORT_SYMBOL(__rt_rwlock_init);
14298 +/**
14299 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
14300 + * @cnt: the atomic which we are to dec
14301 + * @lock: the mutex to return holding if we dec to 0
14302 + *
14303 + * return true and hold lock if we dec to 0, return false otherwise
14304 + */
14305 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
14307 +       /* dec if we can't possibly hit 0 */
14308 +       if (atomic_add_unless(cnt, -1, 1))
14309 +               return 0;
14310 +       /* we might hit 0, so take the lock */
14311 +       mutex_lock(lock);
14312 +       if (!atomic_dec_and_test(cnt)) {
14313 +               /* when we actually did the dec, we didn't hit 0 */
14314 +               mutex_unlock(lock);
14315 +               return 0;
14316 +       }
14317 +       /* we hit 0, and we hold the lock */
14318 +       return 1;
14320 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
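
atomic_dec_and_mutex_lock() serves the usual "drop a reference and, only if it hits zero, tear the object down under the lock" pattern. A rough user-space analogue with C11 atomics and pthreads follows; dec_and_mutex_lock(), obj_put() and obj_list_lock are hypothetical names, not part of this patch:

/* Hypothetical user-space analogue of the atomic_dec_and_mutex_lock()
 * pattern: drop a reference and, only when it hits zero, hold the list
 * lock so that no concurrent lookup can re-find the dying object. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	/* ... payload ... */
};

static pthread_mutex_t obj_list_lock = PTHREAD_MUTEX_INITIALIZER;

static int dec_and_mutex_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
	int old = atomic_load(cnt);

	/* Fast path: decrement as long as we cannot possibly hit zero. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(cnt, &old, old - 1))
			return 0;
	}
	/* We might hit zero: take the lock first, then decrement. */
	pthread_mutex_lock(lock);
	if (atomic_fetch_sub(cnt, 1) != 1) {
		pthread_mutex_unlock(lock);
		return 0;
	}
	return 1;	/* count reached zero and the lock is held */
}

static void obj_put(struct obj *o)
{
	if (dec_and_mutex_lock(&o->refcnt, &obj_list_lock)) {
		/* unlink from the list here, then drop the lock and free */
		pthread_mutex_unlock(&obj_list_lock);
		free(o);
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refcnt, 2);
	obj_put(o);	/* 2 -> 1: fast path, the lock is never taken */
	obj_put(o);	/* 1 -> 0: decremented under obj_list_lock, then freed */
	return 0;
}
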
14321 diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
14322 index 62b6cee8ea7f..0613c4b1d059 100644
14323 --- a/kernel/locking/rtmutex-debug.c
14324 +++ b/kernel/locking/rtmutex-debug.c
14325 @@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
14326         lock->name = name;
14329 -void
14330 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
14334 -void rt_mutex_deadlock_account_unlock(struct task_struct *task)
14338 diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
14339 index d0519c3432b6..b585af9a1b50 100644
14340 --- a/kernel/locking/rtmutex-debug.h
14341 +++ b/kernel/locking/rtmutex-debug.h
14342 @@ -9,9 +9,6 @@
14343   * This file contains macros used solely by rtmutex.c. Debug version.
14344   */
14346 -extern void
14347 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14348 -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
14349  extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
14350  extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
14351  extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
14352 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
14353 index 2c49d76f96c3..3a8b5d44aaf8 100644
14354 --- a/kernel/locking/rtmutex.c
14355 +++ b/kernel/locking/rtmutex.c
14356 @@ -7,6 +7,11 @@
14357   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14358   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
14359   *  Copyright (C) 2006 Esben Nielsen
14360 + *  Adaptive Spinlocks:
14361 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
14362 + *                                  and Peter Morreale,
14363 + * Adaptive Spinlocks simplification:
14364 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
14365   *
14366   *  See Documentation/locking/rt-mutex-design.txt for details.
14367   */
14368 @@ -16,6 +21,8 @@
14369  #include <linux/sched/rt.h>
14370  #include <linux/sched/deadline.h>
14371  #include <linux/timer.h>
14372 +#include <linux/ww_mutex.h>
14373 +#include <linux/blkdev.h>
14375  #include "rtmutex_common.h"
14377 @@ -133,6 +140,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
14378                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
14381 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
14383 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
14384 +               waiter != PI_REQUEUE_INPROGRESS;
14387  /*
14388   * We can speed up the acquire/release, if there's no debugging state to be
14389   * set up.
14390 @@ -222,6 +235,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
14392  #endif
14395 + * Only use with rt_mutex_waiter_{less,equal}()
14396 + */
14397 +#define task_to_waiter(p) &(struct rt_mutex_waiter) \
14398 +       { .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
14400  static inline int
14401  rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14402                      struct rt_mutex_waiter *right)
14403 @@ -236,12 +255,51 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14404          * then right waiter has a dl_prio() too.
14405          */
14406         if (dl_prio(left->prio))
14407 -               return dl_time_before(left->task->dl.deadline,
14408 -                                     right->task->dl.deadline);
14409 +               return dl_time_before(left->deadline, right->deadline);
14411         return 0;
14414 +static inline int
14415 +rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
14416 +                     struct rt_mutex_waiter *right)
14418 +       if (left->prio != right->prio)
14419 +               return 0;
14421 +       /*
14422 +        * If both waiters have dl_prio(), we check the deadlines of the
14423 +        * associated tasks.
14424 +        * If left waiter has a dl_prio(), and we didn't return 0 above,
14425 +        * then right waiter has a dl_prio() too.
14426 +        */
14427 +       if (dl_prio(left->prio))
14428 +               return left->deadline == right->deadline;
14430 +       return 1;
14433 +#define STEAL_NORMAL  0
14434 +#define STEAL_LATERAL 1
14436 +static inline int
14437 +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
14439 +       struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
14441 +       if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
14442 +               return 1;
14444 +       /*
14445 +        * Note that RT tasks are excluded from lateral-steals
14446 +        * to prevent the introduction of an unbounded latency.
14447 +        */
14448 +       if (mode == STEAL_NORMAL || rt_task(waiter->task))
14449 +               return 0;
14451 +       return rt_mutex_waiter_equal(waiter, top_waiter);
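
A simplified standalone view of the steal policy above, with plain integer priorities (lower value means higher priority); the deadline handling and the waiter == top_waiter shortcut of the real code are omitted. It shows the one behavioural difference: STEAL_LATERAL also accepts an equal-priority waiter, except for RT tasks:

/* Stand-alone illustration of the steal decision, not the kernel code.
 * A lower prio value means higher priority, as in the kernel. */
#include <stdbool.h>
#include <stdio.h>

enum { STEAL_NORMAL = 0, STEAL_LATERAL = 1 };

struct waiter { int prio; bool is_rt; };

static bool may_steal(const struct waiter *w, const struct waiter *top, int mode)
{
	if (w->prio < top->prio)		/* strictly higher priority */
		return true;
	if (mode == STEAL_NORMAL || w->is_rt)	/* no lateral steal for RT tasks */
		return false;
	return w->prio == top->prio;		/* lateral: equal priority suffices */
}

int main(void)
{
	struct waiter top = { .prio = 10, .is_rt = false };
	struct waiter eq  = { .prio = 10, .is_rt = false };

	printf("normal:  %d\n", may_steal(&eq, &top, STEAL_NORMAL));	/* 0 */
	printf("lateral: %d\n", may_steal(&eq, &top, STEAL_LATERAL));	/* 1 */
	return 0;
}
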
14454  static void
14455  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
14457 @@ -320,72 +378,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
14458         RB_CLEAR_NODE(&waiter->pi_tree_entry);
14462 - * Calculate task priority from the waiter tree priority
14463 - *
14464 - * Return task->normal_prio when the waiter tree is empty or when
14465 - * the waiter is not allowed to do priority boosting
14466 - */
14467 -int rt_mutex_getprio(struct task_struct *task)
14469 -       if (likely(!task_has_pi_waiters(task)))
14470 -               return task->normal_prio;
14472 -       return min(task_top_pi_waiter(task)->prio,
14473 -                  task->normal_prio);
14476 -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
14478 -       if (likely(!task_has_pi_waiters(task)))
14479 -               return NULL;
14481 -       return task_top_pi_waiter(task)->task;
14485 - * Called by sched_setscheduler() to get the priority which will be
14486 - * effective after the change.
14487 - */
14488 -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
14490 -       if (!task_has_pi_waiters(task))
14491 -               return newprio;
14493 -       if (task_top_pi_waiter(task)->task->prio <= newprio)
14494 -               return task_top_pi_waiter(task)->task->prio;
14495 -       return newprio;
14499 - * Adjust the priority of a task, after its pi_waiters got modified.
14500 - *
14501 - * This can be both boosting and unboosting. task->pi_lock must be held.
14502 - */
14503 -static void __rt_mutex_adjust_prio(struct task_struct *task)
14504 +static void rt_mutex_adjust_prio(struct task_struct *p)
14506 -       int prio = rt_mutex_getprio(task);
14507 +       struct task_struct *pi_task = NULL;
14509 -       if (task->prio != prio || dl_prio(prio))
14510 -               rt_mutex_setprio(task, prio);
14512 +       lockdep_assert_held(&p->pi_lock);
14515 - * Adjust task priority (undo boosting). Called from the exit path of
14516 - * rt_mutex_slowunlock() and rt_mutex_slowlock().
14517 - *
14518 - * (Note: We do this outside of the protection of lock->wait_lock to
14519 - * allow the lock to be taken while or before we readjust the priority
14520 - * of task. We do not use the spin_xx_mutex() variants here as we are
14521 - * outside of the debug path.)
14522 - */
14523 -void rt_mutex_adjust_prio(struct task_struct *task)
14525 -       unsigned long flags;
14526 +       if (task_has_pi_waiters(p))
14527 +               pi_task = task_top_pi_waiter(p)->task;
14529 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
14530 -       __rt_mutex_adjust_prio(task);
14531 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14532 +       rt_mutex_setprio(p, pi_task);
14535  /*
14536 @@ -414,6 +416,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
14537         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
14540 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
14542 +       if (waiter->savestate)
14543 +               wake_up_lock_sleeper(waiter->task);
14544 +       else
14545 +               wake_up_process(waiter->task);
14548  /*
14549   * Max number of times we'll walk the boosting chain:
14550   */
14551 @@ -421,7 +431,8 @@ int max_lock_depth = 1024;
14553  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
14555 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
14556 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
14557 +               p->pi_blocked_on->lock : NULL;
14560  /*
14561 @@ -557,7 +568,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14562          * reached or the state of the chain has changed while we
14563          * dropped the locks.
14564          */
14565 -       if (!waiter)
14566 +       if (!rt_mutex_real_waiter(waiter))
14567                 goto out_unlock_pi;
14569         /*
14570 @@ -608,7 +619,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14571          * enabled we continue, but stop the requeueing in the chain
14572          * walk.
14573          */
14574 -       if (waiter->prio == task->prio) {
14575 +       if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
14576                 if (!detect_deadlock)
14577                         goto out_unlock_pi;
14578                 else
14579 @@ -704,7 +715,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14581         /* [7] Requeue the waiter in the lock waiter tree. */
14582         rt_mutex_dequeue(lock, waiter);
14584 +       /*
14585 +        * Update the waiter prio fields now that we're dequeued.
14586 +        *
14587 +        * These values can have changed through either:
14588 +        *
14589 +        *   sys_sched_set_scheduler() / sys_sched_setattr()
14590 +        *
14591 +        * or
14592 +        *
14593 +        *   DL CBS enforcement advancing the effective deadline.
14594 +        *
14595 +        * Even though pi_waiters also uses these fields, and that tree is only
14596 +        * updated in [11], we can do this here, since we hold [L], which
14597 +        * serializes all pi_waiters access and rb_erase() does not care about
14598 +        * the values of the node being removed.
14599 +        */
14600         waiter->prio = task->prio;
14601 +       waiter->deadline = task->dl.deadline;
14603         rt_mutex_enqueue(lock, waiter);
14605         /* [8] Release the task */
14606 @@ -719,13 +749,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14607          * follow here. This is the end of the chain we are walking.
14608          */
14609         if (!rt_mutex_owner(lock)) {
14610 +               struct rt_mutex_waiter *lock_top_waiter;
14612                 /*
14613                  * If the requeue [7] above changed the top waiter,
14614                  * then we need to wake the new top waiter up to try
14615                  * to get the lock.
14616                  */
14617 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
14618 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
14619 +               lock_top_waiter = rt_mutex_top_waiter(lock);
14620 +               if (prerequeue_top_waiter != lock_top_waiter)
14621 +                       rt_mutex_wake_waiter(lock_top_waiter);
14622                 raw_spin_unlock_irq(&lock->wait_lock);
14623                 return 0;
14624         }
14625 @@ -745,7 +778,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14626                  */
14627                 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
14628                 rt_mutex_enqueue_pi(task, waiter);
14629 -               __rt_mutex_adjust_prio(task);
14630 +               rt_mutex_adjust_prio(task);
14632         } else if (prerequeue_top_waiter == waiter) {
14633                 /*
14634 @@ -761,7 +794,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14635                 rt_mutex_dequeue_pi(task, waiter);
14636                 waiter = rt_mutex_top_waiter(lock);
14637                 rt_mutex_enqueue_pi(task, waiter);
14638 -               __rt_mutex_adjust_prio(task);
14639 +               rt_mutex_adjust_prio(task);
14640         } else {
14641                 /*
14642                  * Nothing changed. No need to do any priority
14643 @@ -818,6 +851,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14644         return ret;
14648  /*
14649   * Try to take an rt-mutex
14650   *
14651 @@ -827,10 +861,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14652   * @task:   The task which wants to acquire the lock
14653   * @waiter: The waiter that is queued to the lock's wait tree if the
14654   *         callsite called task_blocked_on_lock(), otherwise NULL
14655 + * @mode:   Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
14656   */
14657 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14658 -                               struct rt_mutex_waiter *waiter)
14659 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
14660 +                                 struct task_struct *task,
14661 +                                 struct rt_mutex_waiter *waiter, int mode)
14663 +       lockdep_assert_held(&lock->wait_lock);
14665         /*
14666          * Before testing whether we can acquire @lock, we set the
14667          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
14668 @@ -863,12 +901,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14669          */
14670         if (waiter) {
14671                 /*
14672 -                * If waiter is not the highest priority waiter of
14673 -                * @lock, give up.
14674 +                * If waiter is not the highest priority waiter of @lock,
14675 +                * or its peer when lateral steal is allowed, give up.
14676                  */
14677 -               if (waiter != rt_mutex_top_waiter(lock))
14678 +               if (!rt_mutex_steal(lock, waiter, mode))
14679                         return 0;
14681                 /*
14682                  * We can acquire the lock. Remove the waiter from the
14683                  * lock waiters tree.
14684 @@ -886,13 +923,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14685                  */
14686                 if (rt_mutex_has_waiters(lock)) {
14687                         /*
14688 -                        * If @task->prio is greater than or equal to
14689 -                        * the top waiter priority (kernel view),
14690 -                        * @task lost.
14691 +                        * If @task->prio is greater than the top waiter
14692 +                        * priority (kernel view), or equal to it when a
14693 +                        * lateral steal is forbidden, @task lost.
14694                          */
14695 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
14696 +                       if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
14697                                 return 0;
14699                         /*
14700                          * The current top waiter stays enqueued. We
14701                          * don't have to change anything in the lock
14702 @@ -936,177 +972,589 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14703          */
14704         rt_mutex_set_owner(lock, task);
14706 -       rt_mutex_deadlock_account_lock(lock, task);
14708         return 1;
14711 +#ifdef CONFIG_PREEMPT_RT_FULL
14712  /*
14713 - * Task blocks on lock.
14714 - *
14715 - * Prepare waiter and propagate pi chain
14716 - *
14717 - * This must be called with lock->wait_lock held and interrupts disabled
14718 + * preemptible spin_lock functions:
14719   */
14720 -static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14721 -                                  struct rt_mutex_waiter *waiter,
14722 -                                  struct task_struct *task,
14723 -                                  enum rtmutex_chainwalk chwalk)
14724 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
14725 +                                        void  (*slowfn)(struct rt_mutex *lock,
14726 +                                                        bool mg_off),
14727 +                                        bool do_mig_dis)
14729 -       struct task_struct *owner = rt_mutex_owner(lock);
14730 -       struct rt_mutex_waiter *top_waiter = waiter;
14731 -       struct rt_mutex *next_lock;
14732 -       int chain_walk = 0, res;
14733 +       might_sleep_no_state_check();
14735 -       /*
14736 -        * Early deadlock detection. We really don't want the task to
14737 -        * enqueue on itself just to untangle the mess later. It's not
14738 -        * only an optimization. We drop the locks, so another waiter
14739 -        * can come in before the chain walk detects the deadlock. So
14740 -        * the other will detect the deadlock and return -EDEADLOCK,
14741 -        * which is wrong, as the other waiter is not in a deadlock
14742 -        * situation.
14743 -        */
14744 -       if (owner == task)
14745 -               return -EDEADLK;
14746 +       if (do_mig_dis)
14747 +               migrate_disable();
14749 -       raw_spin_lock(&task->pi_lock);
14750 -       __rt_mutex_adjust_prio(task);
14751 -       waiter->task = task;
14752 -       waiter->lock = lock;
14753 -       waiter->prio = task->prio;
14754 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14755 +               return;
14756 +       else
14757 +               slowfn(lock, do_mig_dis);
14760 -       /* Get the top priority waiter on the lock */
14761 -       if (rt_mutex_has_waiters(lock))
14762 -               top_waiter = rt_mutex_top_waiter(lock);
14763 -       rt_mutex_enqueue(lock, waiter);
14764 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
14765 +                                          void  (*slowfn)(struct rt_mutex *lock))
14767 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
14768 +               return;
14769 +       else
14770 +               slowfn(lock);
14772 +#ifdef CONFIG_SMP
14774 + * Note that owner is a speculative pointer and dereferencing relies
14775 + * on rcu_read_lock() and the check against the lock owner.
14776 + */
14777 +static int adaptive_wait(struct rt_mutex *lock,
14778 +                        struct task_struct *owner)
14780 +       int res = 0;
14782 -       task->pi_blocked_on = waiter;
14783 +       rcu_read_lock();
14784 +       for (;;) {
14785 +               if (owner != rt_mutex_owner(lock))
14786 +                       break;
14787 +               /*
14788 +                * Ensure that owner->on_cpu is dereferenced _after_
14789 +                * checking the above to be valid.
14790 +                */
14791 +               barrier();
14792 +               if (!owner->on_cpu) {
14793 +                       res = 1;
14794 +                       break;
14795 +               }
14796 +               cpu_relax();
14797 +       }
14798 +       rcu_read_unlock();
14799 +       return res;
14801 +#else
14802 +static int adaptive_wait(struct rt_mutex *lock,
14803 +                        struct task_struct *orig_owner)
14805 +       return 1;
14807 +#endif
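
The adaptive wait can be sketched outside the kernel as "keep spinning only while the same owner still holds the lock and is still running". The C11 model below is illustrative only (demo_* names are hypothetical) and does not reproduce the RCU protection the kernel relies on for dereferencing the owner:

/* Illustrative only: spin while the current owner is on a CPU,
 * otherwise tell the caller to block. */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

struct demo_owner {
	atomic_bool on_cpu;			/* owner publishes whether it is running */
};

struct demo_lock {
	_Atomic(struct demo_owner *) owner;	/* NULL when unlocked */
};

/* Returns true when the caller should stop spinning and go to sleep. */
static bool demo_adaptive_wait(struct demo_lock *lock, struct demo_owner *owner)
{
	for (;;) {
		/* Owner changed or lock released: retry taking the lock. */
		if (atomic_load(&lock->owner) != owner)
			return false;
		/* Owner is not running: spinning is pointless, sleep instead. */
		if (!atomic_load(&owner->on_cpu))
			return true;
		sched_yield();			/* stand-in for cpu_relax() */
	}
}

int main(void)
{
	struct demo_owner me;
	struct demo_lock lock;

	atomic_init(&me.on_cpu, false);
	atomic_init(&lock.owner, &me);
	/* The owner is not running, so the waiter is told to sleep. */
	return demo_adaptive_wait(&lock, &me) ? 0 : 1;
}
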
14809 -       raw_spin_unlock(&task->pi_lock);
14810 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14811 +                                  struct rt_mutex_waiter *waiter,
14812 +                                  struct task_struct *task,
14813 +                                  enum rtmutex_chainwalk chwalk);
14815 + * Slow path lock function spin_lock style: this variant is very
14816 + * careful not to miss any non-lock wakeups.
14817 + *
14818 + * We store the current state under p->pi_lock in p->saved_state and
14819 + * the try_to_wake_up() code handles this accordingly.
14820 + */
14821 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
14822 +                                                   bool mg_off)
14824 +       struct task_struct *lock_owner, *self = current;
14825 +       struct rt_mutex_waiter waiter, *top_waiter;
14826 +       unsigned long flags;
14827 +       int ret;
14829 -       if (!owner)
14830 -               return 0;
14831 +       rt_mutex_init_waiter(&waiter, true);
14833 -       raw_spin_lock(&owner->pi_lock);
14834 -       if (waiter == rt_mutex_top_waiter(lock)) {
14835 -               rt_mutex_dequeue_pi(owner, top_waiter);
14836 -               rt_mutex_enqueue_pi(owner, waiter);
14837 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14839 -               __rt_mutex_adjust_prio(owner);
14840 -               if (owner->pi_blocked_on)
14841 -                       chain_walk = 1;
14842 -       } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14843 -               chain_walk = 1;
14844 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
14845 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14846 +               return;
14847         }
14849 -       /* Store the lock on which owner is blocked or NULL */
14850 -       next_lock = task_blocked_on_lock(owner);
14851 +       BUG_ON(rt_mutex_owner(lock) == self);
14853 -       raw_spin_unlock(&owner->pi_lock);
14854         /*
14855 -        * Even if full deadlock detection is on, if the owner is not
14856 -        * blocked itself, we can avoid finding this out in the chain
14857 -        * walk.
14858 +        * We save whatever state the task is in and we'll restore it
14859 +        * after acquiring the lock, taking real wakeups into account
14860 +        * as well. We are serialized via pi_lock against wakeups. See
14861 +        * try_to_wake_up().
14862          */
14863 -       if (!chain_walk || !next_lock)
14864 -               return 0;
14865 +       raw_spin_lock(&self->pi_lock);
14866 +       self->saved_state = self->state;
14867 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14868 +       raw_spin_unlock(&self->pi_lock);
14870 -       /*
14871 -        * The owner can't disappear while holding a lock,
14872 -        * so the owner struct is protected by wait_lock.
14873 -        * Gets dropped in rt_mutex_adjust_prio_chain()!
14874 -        */
14875 -       get_task_struct(owner);
14876 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
14877 +       BUG_ON(ret);
14879 -       raw_spin_unlock_irq(&lock->wait_lock);
14880 +       for (;;) {
14881 +               /* Try to acquire the lock again. */
14882 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
14883 +                       break;
14885 -       res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
14886 -                                        next_lock, waiter, task);
14887 +               top_waiter = rt_mutex_top_waiter(lock);
14888 +               lock_owner = rt_mutex_owner(lock);
14890 -       raw_spin_lock_irq(&lock->wait_lock);
14891 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14893 -       return res;
14895 +               debug_rt_mutex_print_deadlock(&waiter);
14898 - * Remove the top waiter from the current tasks pi waiter tree and
14899 - * queue it up.
14900 - *
14901 - * Called with lock->wait_lock held and interrupts disabled.
14902 - */
14903 -static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14904 -                                   struct rt_mutex *lock)
14906 -       struct rt_mutex_waiter *waiter;
14907 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
14908 +                       if (mg_off)
14909 +                               migrate_enable();
14910 +                       schedule();
14911 +                       if (mg_off)
14912 +                               migrate_disable();
14913 +               }
14915 -       raw_spin_lock(&current->pi_lock);
14916 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
14918 -       waiter = rt_mutex_top_waiter(lock);
14919 +               raw_spin_lock(&self->pi_lock);
14920 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14921 +               raw_spin_unlock(&self->pi_lock);
14922 +       }
14924         /*
14925 -        * Remove it from current->pi_waiters. We do not adjust a
14926 -        * possible priority boost right now. We execute wakeup in the
14927 -        * boosted mode and go back to normal after releasing
14928 -        * lock->wait_lock.
14929 +        * Restore the task state to current->saved_state. We set it
14930 +        * to the original state above and the try_to_wake_up() code
14931 +        * has possibly updated it when a real (non-rtmutex) wakeup
14932 +        * happened while we were blocked. Clear saved_state so
14933 +        * try_to_wake_up() does not get confused.
14934          */
14935 -       rt_mutex_dequeue_pi(current, waiter);
14936 +       raw_spin_lock(&self->pi_lock);
14937 +       __set_current_state_no_track(self->saved_state);
14938 +       self->saved_state = TASK_RUNNING;
14939 +       raw_spin_unlock(&self->pi_lock);
14941         /*
14942 -        * As we are waking up the top waiter, and the waiter stays
14943 -        * queued on the lock until it gets the lock, this lock
14944 -        * obviously has waiters. Just set the bit here and this has
14945 -        * the added benefit of forcing all new tasks into the
14946 -        * slow path making sure no task of lower priority than
14947 -        * the top waiter can steal this lock.
14948 +        * try_to_take_rt_mutex() sets the waiter bit
14949 +        * unconditionally. We might have to fix that up:
14950          */
14951 -       lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
14952 +       fixup_rt_mutex_waiters(lock);
14954 -       raw_spin_unlock(&current->pi_lock);
14955 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
14956 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
14958 -       wake_q_add(wake_q, waiter->task);
14959 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14961 +       debug_rt_mutex_free_waiter(&waiter);
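
A toy model of the saved_state mechanism used above, with plain enums and none of the pi_lock serialization (not kernel code): the task's original sleeping state is parked in saved_state while it blocks on the sleeping spinlock, a real (non-lock) wakeup arriving in the meantime is recorded there, and that state is restored once the lock has been acquired:

/* Toy model of the saved_state handling: while a task sleeps on a
 * "sleeping spinlock", wakeups meant for its original wait are recorded
 * in saved_state instead of being lost. */
#include <stdio.h>

enum state { RUNNING, INTERRUPTIBLE, UNINTERRUPTIBLE };

struct task {
	enum state state;	/* what the scheduler acts on */
	enum state saved_state;	/* original state, parked while on the lock */
};

/* Entering the lock slow path: remember what we were doing. */
static void lock_sleep_enter(struct task *t)
{
	t->saved_state = t->state;
	t->state = UNINTERRUPTIBLE;
}

/* A real (non-lock) wakeup arriving meanwhile targets saved_state. */
static void real_wakeup(struct task *t)
{
	t->saved_state = RUNNING;
}

/* Lock acquired: restore whatever the real wakeups left behind. */
static void lock_sleep_exit(struct task *t)
{
	t->state = t->saved_state;
	t->saved_state = RUNNING;
}

int main(void)
{
	struct task t = { .state = INTERRUPTIBLE, .saved_state = RUNNING };

	lock_sleep_enter(&t);	/* blocked on the sleeping spinlock */
	real_wakeup(&t);	/* the event the task originally waited for */
	lock_sleep_exit(&t);
	printf("state after unlock: %s\n",
	       t.state == RUNNING ? "RUNNING" : "still sleeping");
	return 0;
}
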
14964 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
14965 +                                            struct wake_q_head *wake_q,
14966 +                                            struct wake_q_head *wq_sleeper);
14967  /*
14968 - * Remove a waiter from a lock and give up
14969 - *
14970 - * Must be called with lock->wait_lock held and interrupts disabled. I must
14971 - * have just failed to try_to_take_rt_mutex().
14972 + * Slow path to release a rt_mutex spin_lock style
14973   */
14974 -static void remove_waiter(struct rt_mutex *lock,
14975 -                         struct rt_mutex_waiter *waiter)
14976 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
14978 -       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14979 -       struct task_struct *owner = rt_mutex_owner(lock);
14980 -       struct rt_mutex *next_lock;
14981 +       unsigned long flags;
14982 +       WAKE_Q(wake_q);
14983 +       WAKE_Q(wake_sleeper_q);
14984 +       bool postunlock;
14986 -       raw_spin_lock(&current->pi_lock);
14987 -       rt_mutex_dequeue(lock, waiter);
14988 -       current->pi_blocked_on = NULL;
14989 -       raw_spin_unlock(&current->pi_lock);
14990 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14991 +       postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
14992 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14994 -       /*
14995 -        * Only update priority if the waiter was the highest priority
14996 -        * waiter of the lock and there is an owner to update.
14997 -        */
14998 -       if (!owner || !is_top_waiter)
14999 -               return;
15000 +       if (postunlock)
15001 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15004 -       raw_spin_lock(&owner->pi_lock);
15005 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
15007 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
15008 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
15010 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
15012 -       rt_mutex_dequeue_pi(owner, waiter);
15013 +void __lockfunc rt_spin_lock(spinlock_t *lock)
15015 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
15016 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
15018 +EXPORT_SYMBOL(rt_spin_lock);
15020 -       if (rt_mutex_has_waiters(lock))
15021 -               rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
15022 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
15024 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
15026 +EXPORT_SYMBOL(__rt_spin_lock);
15028 -       __rt_mutex_adjust_prio(owner);
15029 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
15031 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
15033 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
15035 -       /* Store the lock on which owner is blocked or NULL */
15036 -       next_lock = task_blocked_on_lock(owner);
15037 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15038 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
15040 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
15041 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
15043 +EXPORT_SYMBOL(rt_spin_lock_nested);
15044 +#endif
15046 -       raw_spin_unlock(&owner->pi_lock);
15047 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
15049 +       /* NOTE: we always pass in '1' for nested, for simplicity */
15050 +       spin_release(&lock->dep_map, 1, _RET_IP_);
15051 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
15053 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
15055 -       /*
15056 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
15058 +       /* NOTE: we always pass in '1' for nested, for simplicity */
15059 +       spin_release(&lock->dep_map, 1, _RET_IP_);
15060 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
15061 +       migrate_enable();
15063 +EXPORT_SYMBOL(rt_spin_unlock);
15065 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
15067 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
15069 +EXPORT_SYMBOL(__rt_spin_unlock);
15072 + * Wait for the lock to get unlocked: instead of polling for an unlock
15073 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
15074 + * schedule if there's contention:
15075 + */
15076 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
15078 +       spin_lock(lock);
15079 +       spin_unlock(lock);
15081 +EXPORT_SYMBOL(rt_spin_unlock_wait);
15083 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
15085 +       int ret;
15087 +       ret = rt_mutex_trylock(&lock->lock);
15088 +       if (ret)
15089 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15090 +       return ret;
15092 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
15094 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
15096 +       int ret;
15098 +       migrate_disable();
15099 +       ret = rt_mutex_trylock(&lock->lock);
15100 +       if (ret)
15101 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15102 +       else
15103 +               migrate_enable();
15104 +       return ret;
15106 +EXPORT_SYMBOL(rt_spin_trylock);
15108 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
15110 +       int ret;
15112 +       local_bh_disable();
15113 +       ret = rt_mutex_trylock(&lock->lock);
15114 +       if (ret) {
15115 +               migrate_disable();
15116 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15117 +       } else
15118 +               local_bh_enable();
15119 +       return ret;
15121 +EXPORT_SYMBOL(rt_spin_trylock_bh);
15123 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
15125 +       int ret;
15127 +       *flags = 0;
15128 +       ret = rt_mutex_trylock(&lock->lock);
15129 +       if (ret) {
15130 +               migrate_disable();
15131 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15132 +       }
15133 +       return ret;
15135 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
15137 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
15139 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
15140 +       if (atomic_add_unless(atomic, -1, 1))
15141 +               return 0;
15142 +       rt_spin_lock(lock);
15143 +       if (atomic_dec_and_test(atomic))
15144 +               return 1;
15145 +       rt_spin_unlock(lock);
15146 +       return 0;
15148 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
15150 +void
15151 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
15153 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15154 +       /*
15155 +        * Make sure we are not reinitializing a held lock:
15156 +        */
15157 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
15158 +       lockdep_init_map(&lock->dep_map, name, key, 0);
15159 +#endif
15161 +EXPORT_SYMBOL(__rt_spin_lock_init);
15163 +#endif /* PREEMPT_RT_FULL */
15165 +#ifdef CONFIG_PREEMPT_RT_FULL
15166 +static inline int __sched
15167 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
15169 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
15170 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
15172 +       if (!hold_ctx)
15173 +               return 0;
15175 +       if (unlikely(ctx == hold_ctx))
15176 +               return -EALREADY;
15178 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
15179 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
15180 +#ifdef CONFIG_DEBUG_MUTEXES
15181 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
15182 +               ctx->contending_lock = ww;
15183 +#endif
15184 +               return -EDEADLK;
15185 +       }
15187 +       return 0;
15189 +#else
15190 +static inline int __sched
15191 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
15193 +       BUG();
15194 +       return 0;
15197 +#endif
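
The stamp test above is the usual wait/wound ordering rule. The standalone sketch below keeps only the core decision and drops the wrap-safe arithmetic and the pointer tie-break of the real check: the older context (smaller stamp) wins, the younger contender backs off with -EDEADLK, and re-acquisition within the same context yields -EALREADY:

/* Minimal model of the ww_mutex stamp check: the older context (smaller
 * stamp) wins, the younger contender is told to back off. */
#include <errno.h>
#include <stdio.h>

struct ctx { unsigned long stamp; };

static int check_stamp(const struct ctx *mine, const struct ctx *holder)
{
	if (!holder)
		return 0;			/* lock not owned by a ww context */
	if (mine == holder)
		return -EALREADY;		/* already owned in this context */
	if (holder->stamp < mine->stamp)
		return -EDEADLK;		/* holder is older: we must back off */
	return 0;				/* we are older: keep waiting */
}

int main(void)
{
	struct ctx older = { .stamp = 1 }, younger = { .stamp = 2 };

	printf("%d\n", check_stamp(&younger, &older));	/* -EDEADLK  */
	printf("%d\n", check_stamp(&older, &younger));	/* 0: wait   */
	printf("%d\n", check_stamp(&older, &older));	/* -EALREADY */
	return 0;
}
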
15199 +static inline int
15200 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
15201 +                    struct rt_mutex_waiter *waiter)
15203 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
15207 + * Task blocks on lock.
15208 + *
15209 + * Prepare waiter and propagate pi chain
15210 + *
15211 + * This must be called with lock->wait_lock held and interrupts disabled
15212 + */
15213 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
15214 +                                  struct rt_mutex_waiter *waiter,
15215 +                                  struct task_struct *task,
15216 +                                  enum rtmutex_chainwalk chwalk)
15218 +       struct task_struct *owner = rt_mutex_owner(lock);
15219 +       struct rt_mutex_waiter *top_waiter = waiter;
15220 +       struct rt_mutex *next_lock;
15221 +       int chain_walk = 0, res;
15223 +       lockdep_assert_held(&lock->wait_lock);
15225 +       /*
15226 +        * Early deadlock detection. We really don't want the task to
15227 +        * enqueue on itself just to untangle the mess later. It's not
15228 +        * only an optimization. We drop the locks, so another waiter
15229 +        * can come in before the chain walk detects the deadlock. So
15230 +        * the other will detect the deadlock and return -EDEADLOCK,
15231 +        * which is wrong, as the other waiter is not in a deadlock
15232 +        * situation.
15233 +        */
15234 +       if (owner == task)
15235 +               return -EDEADLK;
15237 +       raw_spin_lock(&task->pi_lock);
15239 +       /*
15240 +        * In the case of futex requeue PI, this will be a proxy
15241 +        * lock. The task will wake unaware that it is enqueued on
15242 +        * this lock. Avoid blocking on two locks and corrupting
15243 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
15244 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
15245 +        * before requeue (due to a signal or timeout). Do not enqueue
15246 +        * the task if PI_WAKEUP_INPROGRESS is set.
15247 +        */
15248 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
15249 +               raw_spin_unlock(&task->pi_lock);
15250 +               return -EAGAIN;
15251 +       }
15253 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
15255 +       rt_mutex_adjust_prio(task);
15256 +       waiter->task = task;
15257 +       waiter->lock = lock;
15258 +       waiter->prio = task->prio;
15259 +       waiter->deadline = task->dl.deadline;
15261 +       /* Get the top priority waiter on the lock */
15262 +       if (rt_mutex_has_waiters(lock))
15263 +               top_waiter = rt_mutex_top_waiter(lock);
15264 +       rt_mutex_enqueue(lock, waiter);
15266 +       task->pi_blocked_on = waiter;
15268 +       raw_spin_unlock(&task->pi_lock);
15270 +       if (!owner)
15271 +               return 0;
15273 +       raw_spin_lock(&owner->pi_lock);
15274 +       if (waiter == rt_mutex_top_waiter(lock)) {
15275 +               rt_mutex_dequeue_pi(owner, top_waiter);
15276 +               rt_mutex_enqueue_pi(owner, waiter);
15278 +               rt_mutex_adjust_prio(owner);
15279 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
15280 +                       chain_walk = 1;
15281 +       } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
15282 +               chain_walk = 1;
15283 +       }
15285 +       /* Store the lock on which owner is blocked or NULL */
15286 +       next_lock = task_blocked_on_lock(owner);
15288 +       raw_spin_unlock(&owner->pi_lock);
15289 +       /*
15290 +        * Even if full deadlock detection is on, if the owner is not
15291 +        * blocked itself, we can avoid finding this out in the chain
15292 +        * walk.
15293 +        */
15294 +       if (!chain_walk || !next_lock)
15295 +               return 0;
15297 +       /*
15298 +        * The owner can't disappear while holding a lock,
15299 +        * so the owner struct is protected by wait_lock.
15300 +        * Gets dropped in rt_mutex_adjust_prio_chain()!
15301 +        */
15302 +       get_task_struct(owner);
15304 +       raw_spin_unlock_irq(&lock->wait_lock);
15306 +       res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
15307 +                                        next_lock, waiter, task);
15309 +       raw_spin_lock_irq(&lock->wait_lock);
15311 +       return res;
15315 + * Remove the top waiter from the current task's pi waiter tree and
15316 + * queue it up.
15317 + *
15318 + * Called with lock->wait_lock held and interrupts disabled.
15319 + */
15320 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
15321 +                                   struct wake_q_head *wake_sleeper_q,
15322 +                                   struct rt_mutex *lock)
15324 +       struct rt_mutex_waiter *waiter;
15326 +       raw_spin_lock(&current->pi_lock);
15328 +       waiter = rt_mutex_top_waiter(lock);
15330 +       /*
15331 +        * Remove it from current->pi_waiters and deboost.
15332 +        *
15333 +        * We must in fact deboost here in order to ensure we call
15334 +        * rt_mutex_setprio() to update p->pi_top_task before the
15335 +        * task unblocks.
15336 +        */
15337 +       rt_mutex_dequeue_pi(current, waiter);
15338 +       rt_mutex_adjust_prio(current);
15340 +       /*
15341 +        * As we are waking up the top waiter, and the waiter stays
15342 +        * queued on the lock until it gets the lock, this lock
15343 +        * obviously has waiters. Just set the bit here and this has
15344 +        * the added benefit of forcing all new tasks into the
15345 +        * slow path making sure no task of lower priority than
15346 +        * the top waiter can steal this lock.
15347 +        */
15348 +       lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
15350 +       /*
15351 +        * We deboosted before waking the top waiter task such that we don't
15352 +        * run two tasks with the 'same' priority (and ensure the
15353 +        * p->pi_top_task pointer points to a blocked task). This however can
15354 +        * lead to priority inversion if we would get preempted after the
15355 +        * deboost but before waking our donor task, hence the preempt_disable()
15356 +        * before unlock.
15357 +        *
15358 +        * Pairs with preempt_enable() in rt_mutex_postunlock();
15359 +        */
15360 +       preempt_disable();
15361 +       if (waiter->savestate)
15362 +               wake_q_add_sleeper(wake_sleeper_q, waiter->task);
15363 +       else
15364 +               wake_q_add(wake_q, waiter->task);
15365 +       raw_spin_unlock(&current->pi_lock);
15369 + * Remove a waiter from a lock and give up
15370 + *
15371 + * Must be called with lock->wait_lock held and interrupts disabled. It must
15372 + * have just failed to try_to_take_rt_mutex().
15373 + */
15374 +static void remove_waiter(struct rt_mutex *lock,
15375 +                         struct rt_mutex_waiter *waiter)
15377 +       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
15378 +       struct task_struct *owner = rt_mutex_owner(lock);
15379 +       struct rt_mutex *next_lock = NULL;
15381 +       lockdep_assert_held(&lock->wait_lock);
15383 +       raw_spin_lock(&current->pi_lock);
15384 +       rt_mutex_dequeue(lock, waiter);
15385 +       current->pi_blocked_on = NULL;
15386 +       raw_spin_unlock(&current->pi_lock);
15388 +       /*
15389 +        * Only update priority if the waiter was the highest priority
15390 +        * waiter of the lock and there is an owner to update.
15391 +        */
15392 +       if (!owner || !is_top_waiter)
15393 +               return;
15395 +       raw_spin_lock(&owner->pi_lock);
15397 +       rt_mutex_dequeue_pi(owner, waiter);
15399 +       if (rt_mutex_has_waiters(lock))
15400 +               rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
15402 +       rt_mutex_adjust_prio(owner);
15404 +       /* Store the lock on which owner is blocked or NULL */
15405 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
15406 +               next_lock = task_blocked_on_lock(owner);
15408 +       raw_spin_unlock(&owner->pi_lock);
15410 +       /*
15411          * Don't walk the chain, if the owner task is not blocked
15412          * itself.
15413          */
15414 @@ -1138,21 +1586,30 @@ void rt_mutex_adjust_pi(struct task_struct *task)
15415         raw_spin_lock_irqsave(&task->pi_lock, flags);
15417         waiter = task->pi_blocked_on;
15418 -       if (!waiter || (waiter->prio == task->prio &&
15419 -                       !dl_prio(task->prio))) {
15420 +       if (!rt_mutex_real_waiter(waiter) ||
15421 +           rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
15422                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15423                 return;
15424         }
15425         next_lock = waiter->lock;
15426 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15428         /* gets dropped in rt_mutex_adjust_prio_chain()! */
15429         get_task_struct(task);
15431 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15432         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
15433                                    next_lock, NULL, task);
15436 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
15438 +       debug_rt_mutex_init_waiter(waiter);
15439 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
15440 +       RB_CLEAR_NODE(&waiter->tree_entry);
15441 +       waiter->task = NULL;
15442 +       waiter->savestate = savestate;
15445  /**
15446   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
15447   * @lock:               the rt_mutex to take
15448 @@ -1166,7 +1623,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
15449  static int __sched
15450  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
15451                     struct hrtimer_sleeper *timeout,
15452 -                   struct rt_mutex_waiter *waiter)
15453 +                   struct rt_mutex_waiter *waiter,
15454 +                   struct ww_acquire_ctx *ww_ctx)
15456         int ret = 0;
15458 @@ -1175,16 +1633,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
15459                 if (try_to_take_rt_mutex(lock, current, waiter))
15460                         break;
15462 -               /*
15463 -                * TASK_INTERRUPTIBLE checks for signals and
15464 -                * timeout. Ignored otherwise.
15465 -                */
15466 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
15467 -                       /* Signal pending? */
15468 -                       if (signal_pending(current))
15469 -                               ret = -EINTR;
15470 -                       if (timeout && !timeout->task)
15471 -                               ret = -ETIMEDOUT;
15472 +               if (timeout && !timeout->task) {
15473 +                       ret = -ETIMEDOUT;
15474 +                       break;
15475 +               }
15476 +               if (signal_pending_state(state, current)) {
15477 +                       ret = -EINTR;
15478 +                       break;
15479 +               }
15481 +               if (ww_ctx && ww_ctx->acquired > 0) {
15482 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
15483                         if (ret)
15484                                 break;
15485                 }
15486 @@ -1223,35 +1682,94 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
15487         }
15491 - * Slow path lock function:
15492 - */
15493 -static int __sched
15494 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
15495 -                 struct hrtimer_sleeper *timeout,
15496 -                 enum rtmutex_chainwalk chwalk)
15497 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
15498 +                                                  struct ww_acquire_ctx *ww_ctx)
15500 +#ifdef CONFIG_DEBUG_MUTEXES
15501 +       /*
15502 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
15503 +        * but released with a normal mutex_unlock in this call.
15504 +        *
15505 +        * This should never happen, always use ww_mutex_unlock.
15506 +        */
15507 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
15509 +       /*
15510 +        * Not quite done after calling ww_acquire_done() ?
15511 +        */
15512 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
15514 +       if (ww_ctx->contending_lock) {
15515 +               /*
15516 +                * After -EDEADLK you tried to
15517 +                * acquire a different ww_mutex? Bad!
15518 +                */
15519 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
15521 +               /*
15522 +                * You called ww_mutex_lock after receiving -EDEADLK,
15523 +                * but 'forgot' to unlock everything else first?
15524 +                */
15525 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
15526 +               ww_ctx->contending_lock = NULL;
15527 +       }
15529 +       /*
15530 +        * Naughty, using a different class will lead to undefined behavior!
15531 +        */
15532 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
15533 +#endif
15534 +       ww_ctx->acquired++;
15537 +#ifdef CONFIG_PREEMPT_RT_FULL
15538 +static void ww_mutex_account_lock(struct rt_mutex *lock,
15539 +                                 struct ww_acquire_ctx *ww_ctx)
15541 -       struct rt_mutex_waiter waiter;
15542 -       unsigned long flags;
15543 -       int ret = 0;
15544 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
15545 +       struct rt_mutex_waiter *waiter, *n;
15547 -       debug_rt_mutex_init_waiter(&waiter);
15548 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
15549 -       RB_CLEAR_NODE(&waiter.tree_entry);
15550 +       /*
15551 +        * This branch gets optimized out for the common case,
15552 +        * and is only important for ww_mutex_lock.
15553 +        */
15554 +       ww_mutex_lock_acquired(ww, ww_ctx);
15555 +       ww->ctx = ww_ctx;
15557         /*
15558 -        * Technically we could use raw_spin_[un]lock_irq() here, but this can
15559 -        * be called in early boot if the cmpxchg() fast path is disabled
15560 -        * (debug, no architecture support). In this case we will acquire the
15561 -        * rtmutex with lock->wait_lock held. But we cannot unconditionally
15562 -        * enable interrupts in that early boot case. So we need to use the
15563 -        * irqsave/restore variants.
15564 +        * Give any possible sleeping processes the chance to wake up,
15565 +        * so they can recheck if they have to back off.
15566          */
15567 -       raw_spin_lock_irqsave(&lock->wait_lock, flags);
15568 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
15569 +                                            tree_entry) {
15570 +               /* XXX debug rt mutex waiter wakeup */
15572 +               BUG_ON(waiter->lock != lock);
15573 +               rt_mutex_wake_waiter(waiter);
15574 +       }
15577 +#else
15579 +static void ww_mutex_account_lock(struct rt_mutex *lock,
15580 +                                 struct ww_acquire_ctx *ww_ctx)
15582 +       BUG();
15584 +#endif
15586 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
15587 +                                    struct hrtimer_sleeper *timeout,
15588 +                                    enum rtmutex_chainwalk chwalk,
15589 +                                    struct ww_acquire_ctx *ww_ctx,
15590 +                                    struct rt_mutex_waiter *waiter)
15592 +       int ret;
15594         /* Try to acquire the lock again: */
15595         if (try_to_take_rt_mutex(lock, current, NULL)) {
15596 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15597 +               if (ww_ctx)
15598 +                       ww_mutex_account_lock(lock, ww_ctx);
15599                 return 0;
15600         }
15602 @@ -1261,17 +1779,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
15603         if (unlikely(timeout))
15604                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
15606 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
15607 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
15609 -       if (likely(!ret))
15610 +       if (likely(!ret)) {
15611                 /* sleep on the mutex */
15612 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
15613 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
15614 +                                         ww_ctx);
15615 +       } else if (ww_ctx) {
15616 +               /* ww_mutex received EDEADLK, let it become EALREADY */
15617 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
15618 +               BUG_ON(!ret);
15619 +       }
15621         if (unlikely(ret)) {
15622                 __set_current_state(TASK_RUNNING);
15623                 if (rt_mutex_has_waiters(lock))
15624 -                       remove_waiter(lock, &waiter);
15625 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
15626 +                       remove_waiter(lock, waiter);
15627 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
15628 +               if (!ww_ctx)
15629 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
15630 +       } else if (ww_ctx) {
15631 +               ww_mutex_account_lock(lock, ww_ctx);
15632         }
15634         /*
15635 @@ -1279,6 +1807,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
15636          * unconditionally. We might have to fix that up.
15637          */
15638         fixup_rt_mutex_waiters(lock);
15639 +       return ret;
15643 + * Slow path lock function:
15644 + */
15645 +static int __sched
15646 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
15647 +                 struct hrtimer_sleeper *timeout,
15648 +                 enum rtmutex_chainwalk chwalk,
15649 +                 struct ww_acquire_ctx *ww_ctx)
15651 +       struct rt_mutex_waiter waiter;
15652 +       unsigned long flags;
15653 +       int ret = 0;
15655 +       rt_mutex_init_waiter(&waiter, false);
15657 +       /*
15658 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
15659 +        * be called in early boot if the cmpxchg() fast path is disabled
15660 +        * (debug, no architecture support). In this case we will acquire the
15661 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
15662 +        * enable interrupts in that early boot case. So we need to use the
15663 +        * irqsave/restore variants.
15664 +        */
15665 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
15667 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
15668 +                                      &waiter);
15670         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15672 @@ -1328,10 +1886,12 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
15674  /*
15675   * Slow path to release a rt-mutex.
15676 - * Return whether the current task needs to undo a potential priority boosting.
15677 + *
15678 + * Return whether the current task needs to call rt_mutex_postunlock().
15679   */
15680  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15681 -                                       struct wake_q_head *wake_q)
15682 +                                       struct wake_q_head *wake_q,
15683 +                                       struct wake_q_head *wake_sleeper_q)
15685         unsigned long flags;
15687 @@ -1340,8 +1900,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15689         debug_rt_mutex_unlock(lock);
15691 -       rt_mutex_deadlock_account_unlock(current);
15693         /*
15694          * We must be careful here if the fast path is enabled. If we
15695          * have no waiters queued we cannot set owner to NULL here
15696 @@ -1387,12 +1945,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15697          *
15698          * Queue the next waiter for wakeup once we release the wait_lock.
15699          */
15700 -       mark_wakeup_next_waiter(wake_q, lock);
15702 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
15703         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15705 -       /* check PI boosting */
15706 -       return true;
15707 +       return true; /* call rt_mutex_postunlock() */
15710  /*
15711 @@ -1403,63 +1959,97 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15712   */
15713  static inline int
15714  rt_mutex_fastlock(struct rt_mutex *lock, int state,
15715 +                 struct ww_acquire_ctx *ww_ctx,
15716                   int (*slowfn)(struct rt_mutex *lock, int state,
15717                                 struct hrtimer_sleeper *timeout,
15718 -                               enum rtmutex_chainwalk chwalk))
15719 +                               enum rtmutex_chainwalk chwalk,
15720 +                               struct ww_acquire_ctx *ww_ctx))
15722 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15723 -               rt_mutex_deadlock_account_lock(lock, current);
15724 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15725                 return 0;
15726 -       } else
15727 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
15729 +       /*
15730 +        * If rt_mutex blocks, the function sched_submit_work will not call
15731 +        * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
15732 +        * We must call blk_schedule_flush_plug here, if we don't call it,
15733 +        * a deadlock in device mapper may happen.
15734 +        */
15735 +       if (unlikely(blk_needs_flush_plug(current)))
15736 +               blk_schedule_flush_plug(current);
15738 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
15741  static inline int
15742  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
15743                         struct hrtimer_sleeper *timeout,
15744                         enum rtmutex_chainwalk chwalk,
15745 +                       struct ww_acquire_ctx *ww_ctx,
15746                         int (*slowfn)(struct rt_mutex *lock, int state,
15747                                       struct hrtimer_sleeper *timeout,
15748 -                                     enum rtmutex_chainwalk chwalk))
15749 +                                     enum rtmutex_chainwalk chwalk,
15750 +                                     struct ww_acquire_ctx *ww_ctx))
15752         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
15753 -           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15754 -               rt_mutex_deadlock_account_lock(lock, current);
15755 +           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15756                 return 0;
15757 -       } else
15758 -               return slowfn(lock, state, timeout, chwalk);
15760 +       if (unlikely(blk_needs_flush_plug(current)))
15761 +               blk_schedule_flush_plug(current);
15763 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
15766  static inline int
15767  rt_mutex_fasttrylock(struct rt_mutex *lock,
15768                      int (*slowfn)(struct rt_mutex *lock))
15770 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15771 -               rt_mutex_deadlock_account_lock(lock, current);
15772 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15773                 return 1;
15774 -       }
15776         return slowfn(lock);
15780 + * Performs the wakeup of the top-waiter and re-enables preemption.
15781 + */
15782 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
15783 +                        struct wake_q_head *wq_sleeper)
15785 +       wake_up_q(wake_q);
15786 +       wake_up_q_sleeper(wq_sleeper);
15788 +       /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
15789 +       preempt_enable();
15792  static inline void
15793  rt_mutex_fastunlock(struct rt_mutex *lock,
15794                     bool (*slowfn)(struct rt_mutex *lock,
15795 -                                  struct wake_q_head *wqh))
15796 +                                  struct wake_q_head *wqh,
15797 +                                  struct wake_q_head *wq_sleeper))
15799         WAKE_Q(wake_q);
15800 +       WAKE_Q(wake_sleeper_q);
15802 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15803 -               rt_mutex_deadlock_account_unlock(current);
15804 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
15805 +               return;
15807 -       } else {
15808 -               bool deboost = slowfn(lock, &wake_q);
15809 +       if (slowfn(lock, &wake_q,  &wake_sleeper_q))
15810 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15813 -               wake_up_q(&wake_q);
15814 +/**
15815 + * rt_mutex_lock_state - lock a rt_mutex with a given state
15816 + *
15817 + * @lock:      The rt_mutex to be locked
15818 + * @state:     The state to set when blocking on the rt_mutex
15819 + */
15820 +int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
15822 +       might_sleep();
15824 -               /* Undo pi boosting if necessary: */
15825 -               if (deboost)
15826 -                       rt_mutex_adjust_prio(current);
15827 -       }
15828 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
15831  /**
15832 @@ -1469,15 +2059,13 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
15833   */
15834  void __sched rt_mutex_lock(struct rt_mutex *lock)
15836 -       might_sleep();
15838 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
15839 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
15841  EXPORT_SYMBOL_GPL(rt_mutex_lock);
15843  /**
15844   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
15845 - *
15846 + *
15847   * @lock:              the rt_mutex to be locked
15848   *
15849   * Returns:
15850 @@ -1486,23 +2074,32 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
15851   */
15852  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
15854 -       might_sleep();
15856 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
15857 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
15859  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
15862 - * Futex variant with full deadlock detection.
15863 +/**
15864 + * rt_mutex_lock_killable - lock a rt_mutex killable
15865 + *
15866 + * @lock:              the rt_mutex to be locked
15868 + *
15869 + * Returns:
15870 + *  0          on success
15871 + * -EINTR      when interrupted by a signal
15872   */
15873 -int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
15874 -                             struct hrtimer_sleeper *timeout)
15875 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
15877 -       might_sleep();
15878 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
15880 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
15882 -       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15883 -                                      RT_MUTEX_FULL_CHAINWALK,
15884 -                                      rt_mutex_slowlock);
15886 + * Futex variant, must not use fastpath.
15887 + */
15888 +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
15890 +       return rt_mutex_slowtrylock(lock);
15893  /**
15894 @@ -1525,6 +2122,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
15896         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15897                                        RT_MUTEX_MIN_CHAINWALK,
15898 +                                      NULL,
15899                                        rt_mutex_slowlock);
15901  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15902 @@ -1542,7 +2140,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15903   */
15904  int __sched rt_mutex_trylock(struct rt_mutex *lock)
15906 +#ifdef CONFIG_PREEMPT_RT_FULL
15907 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
15908 +#else
15909         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
15910 +#endif
15911                 return 0;
15913         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
15914 @@ -1560,21 +2162,53 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
15916  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
15918 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
15919 +                                            struct wake_q_head *wake_q,
15920 +                                            struct wake_q_head *wq_sleeper)
15922 +       lockdep_assert_held(&lock->wait_lock);
15924 +       debug_rt_mutex_unlock(lock);
15926 +       if (!rt_mutex_has_waiters(lock)) {
15927 +               lock->owner = NULL;
15928 +               return false; /* done */
15929 +       }
15931 +       /*
15932 +        * We've already deboosted, mark_wakeup_next_waiter() will
15933 +        * retain preempt_disabled when we drop the wait_lock, to
15934 +        * avoid inversion prior to the wakeup.  preempt_disable()
15935 +        * therein pairs with rt_mutex_postunlock().
15936 +        */
15937 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
15939 +       return true; /* call postunlock() */
15942  /**
15943 - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
15944 - * @lock: the rt_mutex to be unlocked
15945 - *
15946 - * Returns: true/false indicating whether priority adjustment is
15947 - * required or not.
15948 + * Futex variant: since futex variants do not use the fast-path, this can be
15949 + * simple and will not need to retry.
15950   */
15951 -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
15952 -                                  struct wake_q_head *wqh)
15953 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
15954 +                                   struct wake_q_head *wake_q,
15955 +                                   struct wake_q_head *wq_sleeper)
15957 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15958 -               rt_mutex_deadlock_account_unlock(current);
15959 -               return false;
15960 -       }
15961 -       return rt_mutex_slowunlock(lock, wqh);
15962 +       return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
15965 +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
15967 +       WAKE_Q(wake_q);
15968 +       WAKE_Q(wake_sleeper_q);
15969 +       bool postunlock;
15971 +       raw_spin_lock_irq(&lock->wait_lock);
15972 +       postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
15973 +       raw_spin_unlock_irq(&lock->wait_lock);
15975 +       if (postunlock)
15976 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15979  /**
15980 @@ -1607,13 +2241,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
15981  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
15983         lock->owner = NULL;
15984 -       raw_spin_lock_init(&lock->wait_lock);
15985         lock->waiters = RB_ROOT;
15986         lock->waiters_leftmost = NULL;
15988         debug_rt_mutex_init(lock, name);
15990 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
15991 +EXPORT_SYMBOL(__rt_mutex_init);
15993  /**
15994   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
15995 @@ -1628,10 +2261,9 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
15996  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15997                                 struct task_struct *proxy_owner)
15999 -       __rt_mutex_init(lock, NULL);
16000 +       rt_mutex_init(lock);
16001         debug_rt_mutex_proxy_lock(lock, proxy_owner);
16002         rt_mutex_set_owner(lock, proxy_owner);
16003 -       rt_mutex_deadlock_account_lock(lock, proxy_owner);
16006  /**
16007 @@ -1647,34 +2279,44 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
16009         debug_rt_mutex_proxy_unlock(lock);
16010         rt_mutex_set_owner(lock, NULL);
16011 -       rt_mutex_deadlock_account_unlock(proxy_owner);
16014 -/**
16015 - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
16016 - * @lock:              the rt_mutex to take
16017 - * @waiter:            the pre-initialized rt_mutex_waiter
16018 - * @task:              the task to prepare
16019 - *
16020 - * Returns:
16021 - *  0 - task blocked on lock
16022 - *  1 - acquired the lock for task, caller should wake it up
16023 - * <0 - error
16024 - *
16025 - * Special API call for FUTEX_REQUEUE_PI support.
16026 - */
16027 -int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16028 +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16029                               struct rt_mutex_waiter *waiter,
16030                               struct task_struct *task)
16032         int ret;
16034 -       raw_spin_lock_irq(&lock->wait_lock);
16036 -       if (try_to_take_rt_mutex(lock, task, NULL)) {
16037 -               raw_spin_unlock_irq(&lock->wait_lock);
16038 +       if (try_to_take_rt_mutex(lock, task, NULL))
16039                 return 1;
16041 +#ifdef CONFIG_PREEMPT_RT_FULL
16042 +       /*
16043 +        * In PREEMPT_RT there's an added race.
16044 +        * If the task that we are about to requeue times out,
16045 +        * it can set PI_WAKEUP_INPROGRESS. This tells the requeue
16046 +        * to skip this task. But right after the task sets
16047 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
16048 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
16049 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
16050 +        * lock that it blocks on. We *must not* place this task
16051 +        * on this proxy lock in that case.
16052 +        *
16053 +        * To prevent this race, we first take the task's pi_lock
16054 +        * and check if it has updated its pi_blocked_on. If it has,
16055 +        * we assume that it woke up and we return -EAGAIN.
16056 +        * Otherwise, we set the task's pi_blocked_on to
16057 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
16058 +        * it will know that we are in the process of requeuing it.
16059 +        */
16060 +       raw_spin_lock(&task->pi_lock);
16061 +       if (task->pi_blocked_on) {
16062 +               raw_spin_unlock(&task->pi_lock);
16063 +               return -EAGAIN;
16064         }
16065 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
16066 +       raw_spin_unlock(&task->pi_lock);
16067 +#endif
16069         /* We enforce deadlock detection for futexes */
16070         ret = task_blocks_on_rt_mutex(lock, waiter, task,
16071 @@ -1690,16 +2332,40 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16072                 ret = 0;
16073         }
16075 -       if (unlikely(ret))
16076 +       if (ret && rt_mutex_has_waiters(lock))
16077                 remove_waiter(lock, waiter);
16079 -       raw_spin_unlock_irq(&lock->wait_lock);
16081         debug_rt_mutex_print_deadlock(waiter);
16083         return ret;
16086 +/**
16087 + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
16088 + * @lock:              the rt_mutex to take
16089 + * @waiter:            the pre-initialized rt_mutex_waiter
16090 + * @task:              the task to prepare
16091 + *
16092 + * Returns:
16093 + *  0 - task blocked on lock
16094 + *  1 - acquired the lock for task, caller should wake it up
16095 + * <0 - error
16096 + *
16097 + * Special API call for FUTEX_REQUEUE_PI support.
16098 + */
16099 +int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16100 +                             struct rt_mutex_waiter *waiter,
16101 +                             struct task_struct *task)
16103 +       int ret;
16105 +       raw_spin_lock_irq(&lock->wait_lock);
16106 +       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
16107 +       raw_spin_unlock_irq(&lock->wait_lock);
16109 +       return ret;
16112  /**
16113   * rt_mutex_next_owner - return the next owner of the lock
16114   *
16115 @@ -1721,36 +2387,106 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
16118  /**
16119 - * rt_mutex_finish_proxy_lock() - Complete lock acquisition
16120 + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
16121   * @lock:              the rt_mutex we were woken on
16122   * @to:                        the timeout, null if none. hrtimer should already have
16123   *                     been started.
16124   * @waiter:            the pre-initialized rt_mutex_waiter
16125   *
16126 - * Complete the lock acquisition started our behalf by another thread.
16127 + * Wait for the lock acquisition started on our behalf by
16128 + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
16129 + * rt_mutex_cleanup_proxy_lock().
16130   *
16131   * Returns:
16132   *  0 - success
16133   * <0 - error, one of -EINTR, -ETIMEDOUT
16134   *
16135 - * Special API call for PI-futex requeue support
16136 + * Special API call for PI-futex support
16137   */
16138 -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16139 +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
16140                                struct hrtimer_sleeper *to,
16141                                struct rt_mutex_waiter *waiter)
16143 +       struct task_struct *tsk = current;
16144         int ret;
16146         raw_spin_lock_irq(&lock->wait_lock);
16148 +       /* sleep on the mutex */
16149         set_current_state(TASK_INTERRUPTIBLE);
16150 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
16151 +       /*
16152 +        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
16153 +        * have to fix that up.
16154 +        */
16155 +       fixup_rt_mutex_waiters(lock);
16157 -       /* sleep on the mutex */
16158 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
16159 +       /*
16160 +        * RT has a problem here when the wait gets interrupted by a timeout
16161 +        * or a signal. task->pi_blocked_on is still set. The task must
16162 +        * acquire the hash bucket lock when returning from this function.
16163 +        *
16164 +        * If the hash bucket lock is contended then the
16165 +        * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
16166 +        * task_blocks_on_rt_mutex() will trigger. This can be avoided by
16167 +        * clearing task->pi_blocked_on which removes the task from the
16168 +        * boosting chain of the rtmutex. That's correct because the task
16169 +        * is no longer blocked on it.
16170 +        */
16171 +       if (ret) {
16172 +               raw_spin_lock(&tsk->pi_lock);
16173 +               tsk->pi_blocked_on = NULL;
16174 +               raw_spin_unlock(&tsk->pi_lock);
16175 +       }
16176 +       raw_spin_unlock_irq(&lock->wait_lock);
16178 -       if (unlikely(ret))
16179 -               remove_waiter(lock, waiter);
16180 +       return ret;
16183 +/**
16184 + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
16185 + * @lock:              the rt_mutex we were woken on
16186 + * @waiter:            the pre-initialized rt_mutex_waiter
16187 + *
16188 + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
16189 + *
16190 + * Unless we acquired the lock, we're still enqueued on the wait-list and can
16191 + * in fact still be granted ownership until we're removed. Therefore we can
16192 + * find we are in fact the owner and must disregard the
16193 + * rt_mutex_wait_proxy_lock() failure.
16194 + *
16195 + * Returns:
16196 + *  true  - did the cleanup; we are done.
16197 + *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
16198 + *          the caller should disregard its return value.
16199 + *
16200 + * Special API call for PI-futex support
16201 + */
16202 +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
16203 +                                struct rt_mutex_waiter *waiter)
16205 +       bool cleanup = false;
16207 +       raw_spin_lock_irq(&lock->wait_lock);
16208 +       /*
16209 +        * Do an unconditional try-lock; this deals with the lock stealing
16210 +        * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
16211 +        * sets a NULL owner.
16212 +        *
16213 +        * We're not interested in the return value, because the subsequent
16214 +        * test on rt_mutex_owner() will infer that. If the trylock succeeded,
16215 +        * we will own the lock and it will have removed the waiter. If we
16216 +        * failed the trylock, we're still not owner and we need to remove
16217 +        * ourselves.
16218 +        */
16219 +       try_to_take_rt_mutex(lock, current, waiter);
16220 +       /*
16221 +        * Unless we're the owner, we're still enqueued on the wait_list.
16222 +        * So check if we became owner, if not, take us off the wait_list.
16223 +        */
16224 +       if (rt_mutex_owner(lock) != current) {
16225 +               remove_waiter(lock, waiter);
16226 +               cleanup = true;
16227 +       }
16228         /*
16229          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
16230          * have to fix that up.
16231 @@ -1759,5 +2495,91 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16233         raw_spin_unlock_irq(&lock->wait_lock);
16235 +       return cleanup;
16238 +static inline int
16239 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
16241 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
16242 +       unsigned tmp;
16244 +       if (ctx->deadlock_inject_countdown-- == 0) {
16245 +               tmp = ctx->deadlock_inject_interval;
16246 +               if (tmp > UINT_MAX/4)
16247 +                       tmp = UINT_MAX;
16248 +               else
16249 +                       tmp = tmp*2 + tmp + tmp/2;
16251 +               ctx->deadlock_inject_interval = tmp;
16252 +               ctx->deadlock_inject_countdown = tmp;
16253 +               ctx->contending_lock = lock;
16255 +               ww_mutex_unlock(lock);
16257 +               return -EDEADLK;
16258 +       }
16259 +#endif
16261 +       return 0;
16264 +#ifdef CONFIG_PREEMPT_RT_FULL
16265 +int __sched
16266 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
16268 +       int ret;
16270 +       might_sleep();
16272 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
16273 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
16274 +       if (ret)
16275 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
16276 +       else if (!ret && ww_ctx->acquired > 1)
16277 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
16279 +       return ret;
16281 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
16283 +int __sched
16284 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
16286 +       int ret;
16288 +       might_sleep();
16290 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
16291 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
16292 +       if (ret)
16293 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
16294 +       else if (!ret && ww_ctx->acquired > 1)
16295 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
16297         return ret;
16299 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
16301 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
16303 +       int nest = !!lock->ctx;
16305 +       /*
16306 +        * The unlocking fastpath is the 0->1 transition from 'locked'
16307 +        * into 'unlocked' state:
16308 +        */
16309 +       if (nest) {
16310 +#ifdef CONFIG_DEBUG_MUTEXES
16311 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
16312 +#endif
16313 +               if (lock->ctx->acquired > 0)
16314 +                       lock->ctx->acquired--;
16315 +               lock->ctx = NULL;
16316 +       }
16318 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
16319 +       rt_mutex_unlock(&lock->base.lock);
16321 +EXPORT_SYMBOL(ww_mutex_unlock);
16322 +#endif
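
Illustrative sketch, not part of the patch: the hunks above replace
rt_mutex_finish_proxy_lock() with a wait step (rt_mutex_wait_proxy_lock())
plus an explicit cleanup step (rt_mutex_cleanup_proxy_lock()). Per the
kernel-doc above, a caller such as the futex requeue-PI path is expected to
use them roughly as follows; the hash-bucket locking and waiter setup of the
real futex code are omitted and the function name is made up.

static int proxy_wait_sketch(struct rt_mutex *pi_mutex,
			     struct hrtimer_sleeper *to,
			     struct rt_mutex_waiter *waiter)
{
	int ret;

	/* Sleep until the acquisition started on our behalf by
	 * rt_mutex_start_proxy_lock() succeeds, times out or is
	 * interrupted by a signal. */
	ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);

	/* On failure we may still have become owner while dropping out;
	 * rt_mutex_cleanup_proxy_lock() returning false tells us to
	 * disregard the error and treat the lock as acquired. */
	if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter))
		ret = 0;

	return ret;
}
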
16323 diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
16324 index c4060584c407..6607802efa8b 100644
16325 --- a/kernel/locking/rtmutex.h
16326 +++ b/kernel/locking/rtmutex.h
16327 @@ -11,8 +11,6 @@
16328   */
16330  #define rt_mutex_deadlock_check(l)                     (0)
16331 -#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
16332 -#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
16333  #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
16334  #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
16335  #define debug_rt_mutex_lock(l)                         do { } while (0)
16336 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
16337 index e317e1cbb3eb..64d89d780059 100644
16338 --- a/kernel/locking/rtmutex_common.h
16339 +++ b/kernel/locking/rtmutex_common.h
16340 @@ -27,12 +27,14 @@ struct rt_mutex_waiter {
16341         struct rb_node          pi_tree_entry;
16342         struct task_struct      *task;
16343         struct rt_mutex         *lock;
16344 +       bool                    savestate;
16345  #ifdef CONFIG_DEBUG_RT_MUTEXES
16346         unsigned long           ip;
16347         struct pid              *deadlock_task_pid;
16348         struct rt_mutex         *deadlock_lock;
16349  #endif
16350         int prio;
16351 +       u64 deadline;
16352  };
16354  /*
16355 @@ -98,21 +100,45 @@ enum rtmutex_chainwalk {
16356  /*
16357   * PI-futex support (proxy locking functions, etc.):
16358   */
16359 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
16360 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
16362  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
16363  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
16364                                        struct task_struct *proxy_owner);
16365  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
16366                                   struct task_struct *proxy_owner);
16367 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
16368 +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16369 +                                    struct rt_mutex_waiter *waiter,
16370 +                                    struct task_struct *task);
16371  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16372                                      struct rt_mutex_waiter *waiter,
16373                                      struct task_struct *task);
16374 -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16375 -                                     struct hrtimer_sleeper *to,
16376 -                                     struct rt_mutex_waiter *waiter);
16377 -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
16378 -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
16379 -                                 struct wake_q_head *wqh);
16380 -extern void rt_mutex_adjust_prio(struct task_struct *task);
16381 +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
16382 +                              struct hrtimer_sleeper *to,
16383 +                              struct rt_mutex_waiter *waiter);
16384 +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
16385 +                                struct rt_mutex_waiter *waiter);
16387 +extern int rt_mutex_futex_trylock(struct rt_mutex *l);
16389 +extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
16390 +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
16391 +                                struct wake_q_head *wqh,
16392 +                                struct wake_q_head *wq_sleeper);
16394 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
16395 +                               struct wake_q_head *wq_sleeper);
16397 +/* RW semaphore special interface */
16398 +struct ww_acquire_ctx;
16400 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
16401 +                                    struct hrtimer_sleeper *timeout,
16402 +                                    enum rtmutex_chainwalk chwalk,
16403 +                                    struct ww_acquire_ctx *ww_ctx,
16404 +                                    struct rt_mutex_waiter *waiter);
16406  #ifdef CONFIG_DEBUG_RT_MUTEXES
16407  # include "rtmutex-debug.h"
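
The PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS markers added above are
fake rt_mutex_waiter pointers stored in task->pi_blocked_on so that
concurrent wakeup and requeue paths can detect each other; they must never
be dereferenced. The comments in rtmutex.c refer to an rt_mutex_real_waiter()
helper that filters them out. A sketch of that filter, plus a hypothetical
consumer, assuming the helper sits next to these defines:

static inline bool rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
{
	/* NULL and the two marker values are not real waiters. */
	return waiter && waiter != PI_WAKEUP_INPROGRESS &&
	       waiter != PI_REQUEUE_INPROGRESS;
}

/* Hypothetical consumer: only follow pi_blocked_on when it points at a
 * real waiter; otherwise a wakeup/requeue is in flight. */
static struct rt_mutex *blocked_on_lock_sketch(struct task_struct *task)
{
	struct rt_mutex_waiter *w = task->pi_blocked_on;

	return rt_mutex_real_waiter(w) ? w->lock : NULL;
}
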
16408 diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
16409 new file mode 100644
16410 index 000000000000..4a708ffcded6
16411 --- /dev/null
16412 +++ b/kernel/locking/rwsem-rt.c
16413 @@ -0,0 +1,268 @@
16415 + */
16416 +#include <linux/rwsem.h>
16417 +#include <linux/sched.h>
16418 +#include <linux/export.h>
16420 +#include "rtmutex_common.h"
16423 + * RT-specific reader/writer semaphores
16424 + *
16425 + * down_write()
16426 + *  1) Lock sem->rtmutex
16427 + *  2) Remove the reader BIAS to force readers into the slow path
16428 + *  3) Wait until all readers have left the critical region
16429 + *  4) Mark it write locked
16430 + *
16431 + * up_write()
16432 + *  1) Remove the write locked marker
16433 + *  2) Set the reader BIAS so readers can use the fast path again
16434 + *  3) Unlock sem->rtmutex to release blocked readers
16435 + *
16436 + * down_read()
16437 + *  1) Try fast path acquisition (reader BIAS is set)
16438 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
16439 + *  3) If !writelocked, acquire it for read
16440 + *  4) If writelocked, block on sem->rtmutex
16441 + *  5) unlock sem->rtmutex, goto 1)
16442 + *
16443 + * up_read()
16444 + *  1) Try fast path release (reader count != 1)
16445 + *  2) Wake the writer waiting in down_write()#3
16446 + *
16447 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
16448 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
16449 + * are subject to the rtmutex priority/DL inheritance mechanism.
16450 + *
16451 + * It's possible to make the rw semaphores writer fair by keeping a list of
16452 + * active readers. A blocked writer would force all newly incoming readers to
16453 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
16454 + * reader after the other. We can't use multi-reader inheritance because there
16455 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
16456 + * reader boosting/handover mechanism would be major surgery for very dubious
16457 + * value.
16458 + *
16459 + * The risk of writer starvation is there, but the pathological use cases
16460 + * which trigger it are not necessarily the typical RT workloads.
16461 + */
16463 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
16464 +                 struct lock_class_key *key)
16466 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
16467 +       /*
16468 +        * Make sure we are not reinitializing a held semaphore:
16469 +        */
16470 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
16471 +       lockdep_init_map(&sem->dep_map, name, key, 0);
16472 +#endif
16473 +       atomic_set(&sem->readers, READER_BIAS);
16475 +EXPORT_SYMBOL(__rwsem_init);
16477 +int __down_read_trylock(struct rw_semaphore *sem)
16479 +       int r, old;
16481 +       /*
16482 +        * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
16483 +        * set.
16484 +        */
16485 +       for (r = atomic_read(&sem->readers); r < 0;) {
16486 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
16487 +               if (likely(old == r))
16488 +                       return 1;
16489 +               r = old;
16490 +       }
16491 +       return 0;
16494 +void __sched __down_read(struct rw_semaphore *sem)
16496 +       struct rt_mutex *m = &sem->rtmutex;
16497 +       struct rt_mutex_waiter waiter;
16499 +       if (__down_read_trylock(sem))
16500 +               return;
16502 +       might_sleep();
16503 +       raw_spin_lock_irq(&m->wait_lock);
16504 +       /*
16505 +        * Allow readers as long as the writer has not completely
16506 +        * acquired the semaphore for write.
16507 +        */
16508 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
16509 +               atomic_inc(&sem->readers);
16510 +               raw_spin_unlock_irq(&m->wait_lock);
16511 +               return;
16512 +       }
16514 +       /*
16515 +        * Call into the slow lock path with the rtmutex->wait_lock
16516 +        * held, so this can't result in the following race:
16517 +        *
16518 +        * Reader1              Reader2         Writer
16519 +        *                      down_read()
16520 +        *                                      down_write()
16521 +        *                                      rtmutex_lock(m)
16522 +        *                                      swait()
16523 +        * down_read()
16524 +        * unlock(m->wait_lock)
16525 +        *                      up_read()
16526 +        *                      swake()
16527 +        *                                      lock(m->wait_lock)
16528 +        *                                      sem->writelocked=true
16529 +        *                                      unlock(m->wait_lock)
16530 +        *
16531 +        *                                      up_write()
16532 +        *                                      sem->writelocked=false
16533 +        *                                      rtmutex_unlock(m)
16534 +        *                      down_read()
16535 +        *                                      down_write()
16536 +        *                                      rtmutex_lock(m)
16537 +        *                                      swait()
16538 +        * rtmutex_lock(m)
16539 +        *
16540 +        * That would put Reader1 behind the writer waiting on
16541 +        * Reader2 to call up_read() which might be unbound.
16542 +        */
16543 +       rt_mutex_init_waiter(&waiter, false);
16544 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
16545 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
16546 +                                &waiter);
16547 +       /*
16548 +        * The slowlock() above is guaranteed to return with the rtmutex
16549 +        * now held, so there can't be a writer active. Increment the reader
16550 +        * count and immediately drop the rtmutex again.
16551 +        */
16552 +       atomic_inc(&sem->readers);
16553 +       raw_spin_unlock_irq(&m->wait_lock);
16554 +       rt_mutex_unlock(m);
16556 +       debug_rt_mutex_free_waiter(&waiter);
16559 +void __up_read(struct rw_semaphore *sem)
16561 +       struct rt_mutex *m = &sem->rtmutex;
16562 +       struct task_struct *tsk;
16564 +       /*
16565 +        * sem->readers can only hit 0 when a writer is waiting for the
16566 +        * active readers to leave the critical region.
16567 +        */
16568 +       if (!atomic_dec_and_test(&sem->readers))
16569 +               return;
16571 +       might_sleep();
16572 +       raw_spin_lock_irq(&m->wait_lock);
16573 +       /*
16574 +        * Wake the writer, i.e. the rtmutex owner. It might release the
16575 +        * rtmutex concurrently in the fast path (due to a signal), but to
16576 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
16577 +        * case which can happen is a spurious wakeup.
16578 +        */
16579 +       tsk = rt_mutex_owner(m);
16580 +       if (tsk)
16581 +               wake_up_process(tsk);
16583 +       raw_spin_unlock_irq(&m->wait_lock);
16586 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
16587 +                             unsigned long flags)
16589 +       struct rt_mutex *m = &sem->rtmutex;
16591 +       atomic_add(READER_BIAS - bias, &sem->readers);
16592 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16593 +       rt_mutex_unlock(m);
16596 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
16598 +       struct rt_mutex *m = &sem->rtmutex;
16599 +       unsigned long flags;
16601 +       /* Take the rtmutex as a first step */
16602 +       if (rt_mutex_lock_state(m, state))
16603 +               return -EINTR;
16605 +       /* Force readers into slow path */
16606 +       atomic_sub(READER_BIAS, &sem->readers);
16607 +       might_sleep();
16609 +       set_current_state(state);
16610 +       for (;;) {
16611 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
16612 +               /* Have all readers left the critical region? */
16613 +               if (!atomic_read(&sem->readers)) {
16614 +                       atomic_set(&sem->readers, WRITER_BIAS);
16615 +                       __set_current_state(TASK_RUNNING);
16616 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16617 +                       return 0;
16618 +               }
16620 +               if (signal_pending_state(state, current)) {
16621 +                       __set_current_state(TASK_RUNNING);
16622 +                       __up_write_unlock(sem, 0, flags);
16623 +                       return -EINTR;
16624 +               }
16625 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16627 +               if (atomic_read(&sem->readers) != 0) {
16628 +                       schedule();
16629 +                       set_current_state(state);
16630 +               }
16631 +       }
16634 +void __sched __down_write(struct rw_semaphore *sem)
16636 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
16639 +int __sched __down_write_killable(struct rw_semaphore *sem)
16641 +       return __down_write_common(sem, TASK_KILLABLE);
16644 +int __down_write_trylock(struct rw_semaphore *sem)
16646 +       struct rt_mutex *m = &sem->rtmutex;
16647 +       unsigned long flags;
16649 +       if (!rt_mutex_trylock(m))
16650 +               return 0;
16652 +       atomic_sub(READER_BIAS, &sem->readers);
16654 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16655 +       if (!atomic_read(&sem->readers)) {
16656 +               atomic_set(&sem->readers, WRITER_BIAS);
16657 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16658 +               return 1;
16659 +       }
16660 +       __up_write_unlock(sem, 0, flags);
16661 +       return 0;
16664 +void __up_write(struct rw_semaphore *sem)
16666 +       struct rt_mutex *m = &sem->rtmutex;
16667 +       unsigned long flags;
16669 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16670 +       __up_write_unlock(sem, WRITER_BIAS, flags);
16673 +void __downgrade_write(struct rw_semaphore *sem)
16675 +       struct rt_mutex *m = &sem->rtmutex;
16676 +       unsigned long flags;
16678 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16679 +       /* Release it and account current as reader */
16680 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
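
From the consumer side nothing changes with the rwsem-rt.c implementation
above: the normal rw_semaphore API keeps working, only its blocking
behaviour now goes through the rtmutex as described in the header comment.
A purely illustrative usage sketch (all names invented) follows; readers
stay on the atomic fast path while no writer holds the lock, and a reader
that blocks on a write-locked semaphore boosts the writer through the
underlying rtmutex.

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);
static int example_value;

static int example_read(void)
{
	int v;

	/* Fast path: atomic increment of sem->readers while READER_BIAS
	 * is set; slow path sleeps on sem->rtmutex. */
	down_read(&example_sem);
	v = example_value;
	up_read(&example_sem);
	return v;
}

static int example_write(int v)
{
	/* Takes sem->rtmutex, then waits for the reader count to drain. */
	if (down_write_killable(&example_sem))
		return -EINTR;
	example_value = v;
	up_write(&example_sem);
	return 0;
}
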
16682 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
16683 index db3ccb1dd614..909779647bd1 100644
16684 --- a/kernel/locking/spinlock.c
16685 +++ b/kernel/locking/spinlock.c
16686 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
16687   *         __[spin|read|write]_lock_bh()
16688   */
16689  BUILD_LOCK_OPS(spin, raw_spinlock);
16691 +#ifndef CONFIG_PREEMPT_RT_FULL
16692  BUILD_LOCK_OPS(read, rwlock);
16693  BUILD_LOCK_OPS(write, rwlock);
16694 +#endif
16696  #endif
16698 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
16699  EXPORT_SYMBOL(_raw_spin_unlock_bh);
16700  #endif
16702 +#ifndef CONFIG_PREEMPT_RT_FULL
16704  #ifndef CONFIG_INLINE_READ_TRYLOCK
16705  int __lockfunc _raw_read_trylock(rwlock_t *lock)
16707 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
16708  EXPORT_SYMBOL(_raw_write_unlock_bh);
16709  #endif
16711 +#endif /* !PREEMPT_RT_FULL */
16713  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16715  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
16716 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
16717 index 9aa0fccd5d43..76d0b40d9193 100644
16718 --- a/kernel/locking/spinlock_debug.c
16719 +++ b/kernel/locking/spinlock_debug.c
16720 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
16722  EXPORT_SYMBOL(__raw_spin_lock_init);
16724 +#ifndef CONFIG_PREEMPT_RT_FULL
16725  void __rwlock_init(rwlock_t *lock, const char *name,
16726                    struct lock_class_key *key)
16728 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
16731  EXPORT_SYMBOL(__rwlock_init);
16732 +#endif
16734  static void spin_dump(raw_spinlock_t *lock, const char *msg)
16736 @@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
16737         arch_spin_unlock(&lock->raw_lock);
16740 +#ifndef CONFIG_PREEMPT_RT_FULL
16741  static void rwlock_bug(rwlock_t *lock, const char *msg)
16743         if (!debug_locks_off())
16744 @@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock)
16745         debug_write_unlock(lock);
16746         arch_write_unlock(&lock->raw_lock);
16749 +#endif
16750 diff --git a/kernel/module.c b/kernel/module.c
16751 index 0e54d5bf0097..f27764fbfa24 100644
16752 --- a/kernel/module.c
16753 +++ b/kernel/module.c
16754 @@ -660,16 +660,7 @@ static void percpu_modcopy(struct module *mod,
16755                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
16758 -/**
16759 - * is_module_percpu_address - test whether address is from module static percpu
16760 - * @addr: address to test
16761 - *
16762 - * Test whether @addr belongs to module static percpu area.
16763 - *
16764 - * RETURNS:
16765 - * %true if @addr is from module static percpu area
16766 - */
16767 -bool is_module_percpu_address(unsigned long addr)
16768 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16770         struct module *mod;
16771         unsigned int cpu;
16772 @@ -683,9 +674,15 @@ bool is_module_percpu_address(unsigned long addr)
16773                         continue;
16774                 for_each_possible_cpu(cpu) {
16775                         void *start = per_cpu_ptr(mod->percpu, cpu);
16777 -                       if ((void *)addr >= start &&
16778 -                           (void *)addr < start + mod->percpu_size) {
16779 +                       void *va = (void *)addr;
16781 +                       if (va >= start && va < start + mod->percpu_size) {
16782 +                               if (can_addr) {
16783 +                                       *can_addr = (unsigned long) (va - start);
16784 +                                       *can_addr += (unsigned long)
16785 +                                               per_cpu_ptr(mod->percpu,
16786 +                                                           get_boot_cpu_id());
16787 +                               }
16788                                 preempt_enable();
16789                                 return true;
16790                         }
16791 @@ -696,6 +693,20 @@ bool is_module_percpu_address(unsigned long addr)
16792         return false;
16795 +/**
16796 + * is_module_percpu_address - test whether address is from module static percpu
16797 + * @addr: address to test
16798 + *
16799 + * Test whether @addr belongs to module static percpu area.
16800 + *
16801 + * RETURNS:
16802 + * %true if @addr is from module static percpu area
16803 + */
16804 +bool is_module_percpu_address(unsigned long addr)
16806 +       return __is_module_percpu_address(addr, NULL);
16809  #else /* ... !CONFIG_SMP */
16811  static inline void __percpu *mod_percpu(struct module *mod)
16812 @@ -727,6 +738,11 @@ bool is_module_percpu_address(unsigned long addr)
16813         return false;
16816 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16818 +       return false;
16821  #endif /* CONFIG_SMP */
16823  #define MODINFO_ATTR(field)    \
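
The module.c hunk above splits is_module_percpu_address() so that
__is_module_percpu_address() can additionally report a canonical address:
the offset of the address inside the module's per-CPU area, rebased onto
the boot CPU's copy. Elsewhere in this series that canonical form appears
to be used so that the same statically allocated per-CPU lock maps to one
key regardless of which CPU's copy is passed in. A minimal sketch of that
kind of use, with an invented caller name:

static unsigned long canonical_percpu_addr_sketch(unsigned long addr)
{
	unsigned long can_addr;

	/* Module-static per-CPU object: report the boot-CPU alias so the
	 * copies on all CPUs compare equal. */
	if (__is_module_percpu_address(addr, &can_addr))
		return can_addr;

	/* Anything else is used as-is. */
	return addr;
}
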
16824 diff --git a/kernel/panic.c b/kernel/panic.c
16825 index dbec387099b1..b67a4803ff2b 100644
16826 --- a/kernel/panic.c
16827 +++ b/kernel/panic.c
16828 @@ -482,9 +482,11 @@ static u64 oops_id;
16830  static int init_oops_id(void)
16832 +#ifndef CONFIG_PREEMPT_RT_FULL
16833         if (!oops_id)
16834                 get_random_bytes(&oops_id, sizeof(oops_id));
16835         else
16836 +#endif
16837                 oops_id++;
16839         return 0;
16840 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
16841 index b26dbc48c75b..968255f27a33 100644
16842 --- a/kernel/power/hibernate.c
16843 +++ b/kernel/power/hibernate.c
16844 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
16846         local_irq_disable();
16848 +       system_state = SYSTEM_SUSPEND;
16850         error = syscore_suspend();
16851         if (error) {
16852                 printk(KERN_ERR "PM: Some system devices failed to power down, "
16853 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
16854         syscore_resume();
16856   Enable_irqs:
16857 +       system_state = SYSTEM_RUNNING;
16858         local_irq_enable();
16860   Enable_cpus:
16861 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
16862                 goto Enable_cpus;
16864         local_irq_disable();
16865 +       system_state = SYSTEM_SUSPEND;
16867         error = syscore_suspend();
16868         if (error)
16869 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
16870         syscore_resume();
16872   Enable_irqs:
16873 +       system_state = SYSTEM_RUNNING;
16874         local_irq_enable();
16876   Enable_cpus:
16877 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
16878                 goto Enable_cpus;
16880         local_irq_disable();
16881 +       system_state = SYSTEM_SUSPEND;
16882         syscore_suspend();
16883         if (pm_wakeup_pending()) {
16884                 error = -EAGAIN;
16885 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
16887   Power_up:
16888         syscore_resume();
16889 +       system_state = SYSTEM_RUNNING;
16890         local_irq_enable();
16892   Enable_cpus:
16893 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
16894         return error;
16897 +#ifndef CONFIG_SUSPEND
16898 +bool pm_in_action;
16899 +#endif
16901  /**
16902   * hibernate - Carry out system hibernation, including saving the image.
16903   */
16904 @@ -689,6 +700,8 @@ int hibernate(void)
16905                 return -EPERM;
16906         }
16908 +       pm_in_action = true;
16910         lock_system_sleep();
16911         /* The snapshot device should not be opened while we're running */
16912         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
16913 @@ -766,6 +779,7 @@ int hibernate(void)
16914         atomic_inc(&snapshot_device_available);
16915   Unlock:
16916         unlock_system_sleep();
16917 +       pm_in_action = false;
16918         return error;
16921 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
16922 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
16923 --- a/kernel/power/suspend.c
16924 +++ b/kernel/power/suspend.c
16925 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16926         arch_suspend_disable_irqs();
16927         BUG_ON(!irqs_disabled());
16929 +       system_state = SYSTEM_SUSPEND;
16931         error = syscore_suspend();
16932         if (!error) {
16933                 *wakeup = pm_wakeup_pending();
16934 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16935                 syscore_resume();
16936         }
16938 +       system_state = SYSTEM_RUNNING;
16940         arch_suspend_enable_irqs();
16941         BUG_ON(irqs_disabled());
16943 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
16944         return error;
16947 +bool pm_in_action;
16949  /**
16950   * pm_suspend - Externally visible function for suspending the system.
16951   * @state: System sleep state to enter.
16952 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
16953         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
16954                 return -EINVAL;
16956 +       pm_in_action = true;
16958         error = enter_state(state);
16959         if (error) {
16960                 suspend_stats.fail++;
16961 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
16962         } else {
16963                 suspend_stats.success++;
16964         }
16965 +       pm_in_action = false;
16966         return error;
16968  EXPORT_SYMBOL(pm_suspend);
16969 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
16970 index 9c5b231684d0..cf15bdb6855b 100644
16971 --- a/kernel/printk/printk.c
16972 +++ b/kernel/printk/printk.c
16973 @@ -351,6 +351,65 @@ __packed __aligned(4)
16974   */
16975  DEFINE_RAW_SPINLOCK(logbuf_lock);
16977 +#ifdef CONFIG_EARLY_PRINTK
16978 +struct console *early_console;
16980 +static void early_vprintk(const char *fmt, va_list ap)
16982 +       if (early_console) {
16983 +               char buf[512];
16984 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
16986 +               early_console->write(early_console, buf, n);
16987 +       }
16990 +asmlinkage void early_printk(const char *fmt, ...)
16992 +       va_list ap;
16994 +       va_start(ap, fmt);
16995 +       early_vprintk(fmt, ap);
16996 +       va_end(ap);
17000 + * This is independent of any log levels - a global
17001 + * kill switch that turns off all of printk.
17002 + *
17003 + * Used by the NMI watchdog if early-printk is enabled.
17004 + */
17005 +static bool __read_mostly printk_killswitch;
17007 +static int __init force_early_printk_setup(char *str)
17009 +       printk_killswitch = true;
17010 +       return 0;
17012 +early_param("force_early_printk", force_early_printk_setup);
17014 +void printk_kill(void)
17016 +       printk_killswitch = true;
17019 +#ifdef CONFIG_PRINTK
17020 +static int forced_early_printk(const char *fmt, va_list ap)
17022 +       if (!printk_killswitch)
17023 +               return 0;
17024 +       early_vprintk(fmt, ap);
17025 +       return 1;
17027 +#endif
17029 +#else
17030 +static inline int forced_early_printk(const char *fmt, va_list ap)
17032 +       return 0;
17034 +#endif
17036  #ifdef CONFIG_PRINTK
17037  DECLARE_WAIT_QUEUE_HEAD(log_wait);
17038  /* the next printk record to read by syslog(READ) or /proc/kmsg */
17039 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17041         char *text;
17042         int len = 0;
17043 +       int attempts = 0;
17045         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
17046         if (!text)
17047 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17048                 u64 seq;
17049                 u32 idx;
17050                 enum log_flags prev;
17051 +               int num_msg;
17052 +try_again:
17053 +               attempts++;
17054 +               if (attempts > 10) {
17055 +                       len = -EBUSY;
17056 +                       goto out;
17057 +               }
17058 +               num_msg = 0;
17060                 /*
17061                  * Find first record that fits, including all following records,
17062 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17063                         prev = msg->flags;
17064                         idx = log_next(idx);
17065                         seq++;
17066 +                       num_msg++;
17067 +                       if (num_msg > 5) {
17068 +                               num_msg = 0;
17069 +                               raw_spin_unlock_irq(&logbuf_lock);
17070 +                               raw_spin_lock_irq(&logbuf_lock);
17071 +                               if (clear_seq < log_first_seq)
17072 +                                       goto try_again;
17073 +                       }
17074                 }
17076                 /* move first record forward until length fits into the buffer */
17077 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17078                         prev = msg->flags;
17079                         idx = log_next(idx);
17080                         seq++;
17081 +                       num_msg++;
17082 +                       if (num_msg > 5) {
17083 +                               num_msg = 0;
17084 +                               raw_spin_unlock_irq(&logbuf_lock);
17085 +                               raw_spin_lock_irq(&logbuf_lock);
17086 +                               if (clear_seq < log_first_seq)
17087 +                                       goto try_again;
17088 +                       }
17089                 }
17091                 /* last message fitting into this dump */
17092 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17093                 clear_seq = log_next_seq;
17094                 clear_idx = log_next_idx;
17095         }
17096 +out:
17097         raw_spin_unlock_irq(&logbuf_lock);
17099         kfree(text);
17100 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
17101         if (!console_drivers)
17102                 return;
17104 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
17105 +               if (in_irq() || in_nmi())
17106 +                       return;
17107 +       }
17109 +       migrate_disable();
17110         for_each_console(con) {
17111                 if (exclusive_console && con != exclusive_console)
17112                         continue;
17113 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
17114                 else
17115                         con->write(con, text, len);
17116         }
17117 +       migrate_enable();
17120  /*
17121 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
17122         /* cpu currently holding logbuf_lock in this function */
17123         static unsigned int logbuf_cpu = UINT_MAX;
17125 +       /*
17126 +        * Fall back to early_printk if a debugging subsystem has
17127 +        * killed printk output
17128 +        */
17129 +       if (unlikely(forced_early_printk(fmt, args)))
17130 +               return 1;
17132         if (level == LOGLEVEL_SCHED) {
17133                 level = LOGLEVEL_DEFAULT;
17134                 in_sched = true;
17135 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
17137         /* If called from the scheduler, we can not call up(). */
17138         if (!in_sched) {
17139 +               int may_trylock = 1;
17141                 lockdep_off();
17142 +#ifdef CONFIG_PREEMPT_RT_FULL
17143 +               /*
17144 +                * we can't take a sleeping lock with IRQs or preemption disabled,
17145 +                * so we can't print in these contexts
17146 +                */
17147 +               if (!(preempt_count() == 0 && !irqs_disabled()))
17148 +                       may_trylock = 0;
17149 +#endif
17150                 /*
17151                  * Try to acquire and then immediately release the console
17152                  * semaphore.  The release will print out buffers and wake up
17153                  * /dev/kmsg and syslog() users.
17154                  */
17155 -               if (console_trylock())
17156 +               if (may_trylock && console_trylock())
17157                         console_unlock();
17158                 lockdep_on();
17159         }
17160 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
17162  #endif /* CONFIG_PRINTK */
17164 -#ifdef CONFIG_EARLY_PRINTK
17165 -struct console *early_console;
17167 -asmlinkage __visible void early_printk(const char *fmt, ...)
17169 -       va_list ap;
17170 -       char buf[512];
17171 -       int n;
17173 -       if (!early_console)
17174 -               return;
17176 -       va_start(ap, fmt);
17177 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
17178 -       va_end(ap);
17180 -       early_console->write(early_console, buf, n);
17182 -#endif
17184  static int __add_preferred_console(char *name, int idx, char *options,
17185                                    char *brl_options)
17187 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
17188                 goto out;
17190         len = cont_print_text(text, size);
17191 +#ifdef CONFIG_PREEMPT_RT_FULL
17192 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17193 +       call_console_drivers(cont.level, NULL, 0, text, len);
17194 +#else
17195         raw_spin_unlock(&logbuf_lock);
17196         stop_critical_timings();
17197         call_console_drivers(cont.level, NULL, 0, text, len);
17198         start_critical_timings();
17199         local_irq_restore(flags);
17200 +#endif
17201         return;
17202  out:
17203         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17204 @@ -2431,13 +2525,17 @@ void console_unlock(void)
17205                 console_idx = log_next(console_idx);
17206                 console_seq++;
17207                 console_prev = msg->flags;
17208 +#ifdef CONFIG_PREEMPT_RT_FULL
17209 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17210 +               call_console_drivers(level, ext_text, ext_len, text, len);
17211 +#else
17212                 raw_spin_unlock(&logbuf_lock);
17214                 stop_critical_timings();        /* don't trace print latency */
17215                 call_console_drivers(level, ext_text, ext_len, text, len);
17216                 start_critical_timings();
17217                 local_irq_restore(flags);
17219 +#endif
17220                 if (do_cond_resched)
17221                         cond_resched();
17222         }
17223 @@ -2489,6 +2587,11 @@ void console_unblank(void)
17225         struct console *c;
17227 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
17228 +               if (in_irq() || in_nmi())
17229 +                       return;
17230 +       }
17232         /*
17233          * console_unblank can no longer be called in interrupt context unless
17234          * oops_in_progress is set to 1..
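
The printk changes above move the early_printk()/early_console glue ahead
of vprintk_emit(), add a global kill switch that reroutes all printk output
to the early console (set at boot via the force_early_printk parameter or
at runtime via printk_kill()), and keep console output out of IRQ/NMI
context on RT. A sketch of how a debugging facility such as an NMI watchdog
is expected to use the kill switch, assuming an early console was
registered via the usual earlyprintk= plumbing and that printk_kill() and
early_printk() are declared in linux/printk.h by this series:

static void watchdog_report_sketch(const char *why)
{
	/* From here on, vprintk_emit() bails out into early_vprintk()
	 * instead of taking logbuf_lock / console_sem. */
	printk_kill();

	early_printk("watchdog: %s\n", why);
}
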
17235 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
17236 index f39a7be98fc1..583ce3aad891 100644
17237 --- a/kernel/ptrace.c
17238 +++ b/kernel/ptrace.c
17239 @@ -172,7 +172,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
17241         spin_lock_irq(&task->sighand->siglock);
17242         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
17243 -               task->state = __TASK_TRACED;
17244 +               unsigned long flags;
17246 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
17247 +               if (task->state & __TASK_TRACED)
17248 +                       task->state = __TASK_TRACED;
17249 +               else
17250 +                       task->saved_state = __TASK_TRACED;
17251 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
17252                 ret = true;
17253         }
17254         spin_unlock_irq(&task->sighand->siglock);
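Under PREEMPT_RT a task blocked on a converted "sleeping spinlock" keeps its real state in ->saved_state, which is why the ptrace hunk above updates whichever field currently holds __TASK_TRACED under ->pi_lock. A minimal sketch of the corresponding state test, mirroring the check_task_state() helper this patch adds to kernel/sched/core.c further down (the demo_ name is hypothetical):

#include <linux/sched.h>

/* Sketch only: report whether a task is in @state, looking at both
 * ->state and the RT-specific ->saved_state, as the patch does. */
static bool demo_task_in_state(struct task_struct *p, long state)
{
        bool match;

        raw_spin_lock_irq(&p->pi_lock);
        match = (p->state == state) || (p->saved_state == state);
        raw_spin_unlock_irq(&p->pi_lock);

        return match;
}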
17255 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
17256 index bf08fee53dc7..eeb8ce4ad7b6 100644
17257 --- a/kernel/rcu/rcutorture.c
17258 +++ b/kernel/rcu/rcutorture.c
17259 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
17260         .name           = "rcu"
17261  };
17263 +#ifndef CONFIG_PREEMPT_RT_FULL
17264  /*
17265   * Definitions for rcu_bh torture testing.
17266   */
17267 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
17268         .name           = "rcu_bh"
17269  };
17271 +#else
17272 +static struct rcu_torture_ops rcu_bh_ops = {
17273 +       .ttype          = INVALID_RCU_FLAVOR,
17275 +#endif
17277  /*
17278   * Don't even think about trying any of these in real life!!!
17279   * The names include "busted", and they really mean it!
17280 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
17281 index d1a02877a42c..a7b11a29e03a 100644
17282 --- a/kernel/rcu/tree.c
17283 +++ b/kernel/rcu/tree.c
17284 @@ -55,6 +55,11 @@
17285  #include <linux/random.h>
17286  #include <linux/trace_events.h>
17287  #include <linux/suspend.h>
17288 +#include <linux/delay.h>
17289 +#include <linux/gfp.h>
17290 +#include <linux/oom.h>
17291 +#include <linux/smpboot.h>
17292 +#include "../time/tick-internal.h"
17294  #include "tree.h"
17295  #include "rcu.h"
17296 @@ -260,6 +265,19 @@ void rcu_sched_qs(void)
17297                            this_cpu_ptr(&rcu_sched_data), true);
17300 +#ifdef CONFIG_PREEMPT_RT_FULL
17301 +static void rcu_preempt_qs(void);
17303 +void rcu_bh_qs(void)
17305 +       unsigned long flags;
17307 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
17308 +       local_irq_save(flags);
17309 +       rcu_preempt_qs();
17310 +       local_irq_restore(flags);
17312 +#else
17313  void rcu_bh_qs(void)
17315         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
17316 @@ -269,6 +287,7 @@ void rcu_bh_qs(void)
17317                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
17318         }
17320 +#endif
17322  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
17324 @@ -449,11 +468,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
17325  /*
17326   * Return the number of RCU BH batches started thus far for debug & stats.
17327   */
17328 +#ifndef CONFIG_PREEMPT_RT_FULL
17329  unsigned long rcu_batches_started_bh(void)
17331         return rcu_bh_state.gpnum;
17333  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
17334 +#endif
17336  /*
17337   * Return the number of RCU batches completed thus far for debug & stats.
17338 @@ -473,6 +494,7 @@ unsigned long rcu_batches_completed_sched(void)
17340  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
17342 +#ifndef CONFIG_PREEMPT_RT_FULL
17343  /*
17344   * Return the number of RCU BH batches completed thus far for debug & stats.
17345   */
17346 @@ -481,6 +503,7 @@ unsigned long rcu_batches_completed_bh(void)
17347         return rcu_bh_state.completed;
17349  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
17350 +#endif
17352  /*
17353   * Return the number of RCU expedited batches completed thus far for
17354 @@ -504,6 +527,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
17356  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
17358 +#ifndef CONFIG_PREEMPT_RT_FULL
17359  /*
17360   * Force a quiescent state.
17361   */
17362 @@ -522,6 +546,13 @@ void rcu_bh_force_quiescent_state(void)
17364  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
17366 +#else
17367 +void rcu_force_quiescent_state(void)
17370 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
17371 +#endif
17373  /*
17374   * Force a quiescent state for RCU-sched.
17375   */
17376 @@ -572,9 +603,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
17377         case RCU_FLAVOR:
17378                 rsp = rcu_state_p;
17379                 break;
17380 +#ifndef CONFIG_PREEMPT_RT_FULL
17381         case RCU_BH_FLAVOR:
17382                 rsp = &rcu_bh_state;
17383                 break;
17384 +#endif
17385         case RCU_SCHED_FLAVOR:
17386                 rsp = &rcu_sched_state;
17387                 break;
17388 @@ -3026,18 +3059,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
17389  /*
17390   * Do RCU core processing for the current CPU.
17391   */
17392 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
17393 +static __latent_entropy void rcu_process_callbacks(void)
17395         struct rcu_state *rsp;
17397         if (cpu_is_offline(smp_processor_id()))
17398                 return;
17399 -       trace_rcu_utilization(TPS("Start RCU core"));
17400         for_each_rcu_flavor(rsp)
17401                 __rcu_process_callbacks(rsp);
17402 -       trace_rcu_utilization(TPS("End RCU core"));
17405 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
17406  /*
17407   * Schedule RCU callback invocation.  If the specified type of RCU
17408   * does not support RCU priority boosting, just do a direct call,
17409 @@ -3049,19 +3081,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
17411         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
17412                 return;
17413 -       if (likely(!rsp->boost)) {
17414 -               rcu_do_batch(rsp, rdp);
17415 -               return;
17416 -       }
17417 -       invoke_rcu_callbacks_kthread();
17418 +       rcu_do_batch(rsp, rdp);
17421 +static void rcu_wake_cond(struct task_struct *t, int status)
17423 +       /*
17424 +        * If the thread is yielding, only wake it when this
17425 +        * is invoked from idle
17426 +        */
17427 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
17428 +               wake_up_process(t);
17432 + * Wake up this CPU's rcuc kthread to do RCU core processing.
17433 + */
17434  static void invoke_rcu_core(void)
17436 -       if (cpu_online(smp_processor_id()))
17437 -               raise_softirq(RCU_SOFTIRQ);
17438 +       unsigned long flags;
17439 +       struct task_struct *t;
17441 +       if (!cpu_online(smp_processor_id()))
17442 +               return;
17443 +       local_irq_save(flags);
17444 +       __this_cpu_write(rcu_cpu_has_work, 1);
17445 +       t = __this_cpu_read(rcu_cpu_kthread_task);
17446 +       if (t != NULL && current != t)
17447 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
17448 +       local_irq_restore(flags);
17451 +static void rcu_cpu_kthread_park(unsigned int cpu)
17453 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
17456 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
17458 +       return __this_cpu_read(rcu_cpu_has_work);
17462 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
17463 + * RCU softirq used in flavors and configurations of RCU that do not
17464 + * support RCU priority boosting.
17465 + */
17466 +static void rcu_cpu_kthread(unsigned int cpu)
17468 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
17469 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
17470 +       int spincnt;
17472 +       for (spincnt = 0; spincnt < 10; spincnt++) {
17473 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
17474 +               local_bh_disable();
17475 +               *statusp = RCU_KTHREAD_RUNNING;
17476 +               this_cpu_inc(rcu_cpu_kthread_loops);
17477 +               local_irq_disable();
17478 +               work = *workp;
17479 +               *workp = 0;
17480 +               local_irq_enable();
17481 +               if (work)
17482 +                       rcu_process_callbacks();
17483 +               local_bh_enable();
17484 +               if (*workp == 0) {
17485 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
17486 +                       *statusp = RCU_KTHREAD_WAITING;
17487 +                       return;
17488 +               }
17489 +       }
17490 +       *statusp = RCU_KTHREAD_YIELDING;
17491 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
17492 +       schedule_timeout_interruptible(2);
17493 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
17494 +       *statusp = RCU_KTHREAD_WAITING;
17497 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
17498 +       .store                  = &rcu_cpu_kthread_task,
17499 +       .thread_should_run      = rcu_cpu_kthread_should_run,
17500 +       .thread_fn              = rcu_cpu_kthread,
17501 +       .thread_comm            = "rcuc/%u",
17502 +       .setup                  = rcu_cpu_kthread_setup,
17503 +       .park                   = rcu_cpu_kthread_park,
17507 + * Spawn per-CPU RCU core processing kthreads.
17508 + */
17509 +static int __init rcu_spawn_core_kthreads(void)
17511 +       int cpu;
17513 +       for_each_possible_cpu(cpu)
17514 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
17515 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
17516 +       return 0;
17518 +early_initcall(rcu_spawn_core_kthreads);
17520  /*
17521   * Handle any core-RCU processing required by a call_rcu() invocation.
17522   */
17523 @@ -3205,6 +3324,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
17525  EXPORT_SYMBOL_GPL(call_rcu_sched);
17527 +#ifndef CONFIG_PREEMPT_RT_FULL
17528  /*
17529   * Queue an RCU callback for invocation after a quicker grace period.
17530   */
17531 @@ -3213,6 +3333,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
17532         __call_rcu(head, func, &rcu_bh_state, -1, 0);
17534  EXPORT_SYMBOL_GPL(call_rcu_bh);
17535 +#endif
17537  /*
17538   * Queue an RCU callback for lazy invocation after a grace period.
17539 @@ -3304,6 +3425,7 @@ void synchronize_sched(void)
17541  EXPORT_SYMBOL_GPL(synchronize_sched);
17543 +#ifndef CONFIG_PREEMPT_RT_FULL
17544  /**
17545   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
17546   *
17547 @@ -3330,6 +3452,7 @@ void synchronize_rcu_bh(void)
17548                 wait_rcu_gp(call_rcu_bh);
17550  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
17551 +#endif
17553  /**
17554   * get_state_synchronize_rcu - Snapshot current RCU state
17555 @@ -3708,6 +3831,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
17556         mutex_unlock(&rsp->barrier_mutex);
17559 +#ifndef CONFIG_PREEMPT_RT_FULL
17560  /**
17561   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
17562   */
17563 @@ -3716,6 +3840,7 @@ void rcu_barrier_bh(void)
17564         _rcu_barrier(&rcu_bh_state);
17566  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
17567 +#endif
17569  /**
17570   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
17571 @@ -4237,12 +4362,13 @@ void __init rcu_init(void)
17573         rcu_bootup_announce();
17574         rcu_init_geometry();
17575 +#ifndef CONFIG_PREEMPT_RT_FULL
17576         rcu_init_one(&rcu_bh_state);
17577 +#endif
17578         rcu_init_one(&rcu_sched_state);
17579         if (dump_tree)
17580                 rcu_dump_rcu_node_tree(&rcu_sched_state);
17581         __rcu_init_preempt();
17582 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
17584         /*
17585          * We don't need protection against CPU-hotplug here because
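The tree.c changes above move RCU core processing out of RCU_SOFTIRQ into per-CPU rcuc/%u kthreads and compile the separate RCU-bh flavor out on PREEMPT_RT_FULL. Callers do not change; a minimal call_rcu() usage sketch for reference (the demo_ names are hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
        int value;
        struct rcu_head rcu;
};

static void demo_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct demo_node, rcu));
}

/* Retire a node: readers may still hold references, so defer the
 * kfree() until a grace period has elapsed. Whether the callback
 * later runs from RCU_SOFTIRQ or from an rcuc kthread is invisible
 * to this code. */
static void demo_retire(struct demo_node *node)
{
        call_rcu(&node->rcu, demo_free_rcu);
}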
17586 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
17587 index e99a5234d9ed..958ac107062c 100644
17588 --- a/kernel/rcu/tree.h
17589 +++ b/kernel/rcu/tree.h
17590 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
17591   */
17592  extern struct rcu_state rcu_sched_state;
17594 +#ifndef CONFIG_PREEMPT_RT_FULL
17595  extern struct rcu_state rcu_bh_state;
17596 +#endif
17598  #ifdef CONFIG_PREEMPT_RCU
17599  extern struct rcu_state rcu_preempt_state;
17600  #endif /* #ifdef CONFIG_PREEMPT_RCU */
17602 -#ifdef CONFIG_RCU_BOOST
17603  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17604  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
17605  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17606  DECLARE_PER_CPU(char, rcu_cpu_has_work);
17607 -#endif /* #ifdef CONFIG_RCU_BOOST */
17609  #ifndef RCU_TREE_NONCORE
17611 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
17612  static void __init __rcu_init_preempt(void);
17613  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
17614  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
17615 -static void invoke_rcu_callbacks_kthread(void);
17616  static bool rcu_is_callbacks_kthread(void);
17617 +static void rcu_cpu_kthread_setup(unsigned int cpu);
17618  #ifdef CONFIG_RCU_BOOST
17619 -static void rcu_preempt_do_callbacks(void);
17620  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17621                                                  struct rcu_node *rnp);
17622  #endif /* #ifdef CONFIG_RCU_BOOST */
17623 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
17624 index e3944c4b072d..be12d1aac840 100644
17625 --- a/kernel/rcu/tree_plugin.h
17626 +++ b/kernel/rcu/tree_plugin.h
17627 @@ -24,25 +24,10 @@
17628   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
17629   */
17631 -#include <linux/delay.h>
17632 -#include <linux/gfp.h>
17633 -#include <linux/oom.h>
17634 -#include <linux/smpboot.h>
17635 -#include "../time/tick-internal.h"
17637  #ifdef CONFIG_RCU_BOOST
17639  #include "../locking/rtmutex_common.h"
17642 - * Control variables for per-CPU and per-rcu_node kthreads.  These
17643 - * handle all flavors of RCU.
17644 - */
17645 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
17646 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17647 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17648 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
17650  #else /* #ifdef CONFIG_RCU_BOOST */
17652  /*
17653 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
17655  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17658 + * Control variables for per-CPU and per-rcu_node kthreads.  These
17659 + * handle all flavors of RCU.
17660 + */
17661 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17662 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17663 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
17665  #ifdef CONFIG_RCU_NOCB_CPU
17666  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
17667  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
17668 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
17669         }
17671         /* Hardware IRQ handlers cannot block, complain if they get here. */
17672 -       if (in_irq() || in_serving_softirq()) {
17673 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
17674                 lockdep_rcu_suspicious(__FILE__, __LINE__,
17675                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
17676                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
17677 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
17678                 t->rcu_read_unlock_special.b.need_qs = true;
17681 -#ifdef CONFIG_RCU_BOOST
17683 -static void rcu_preempt_do_callbacks(void)
17685 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
17688 -#endif /* #ifdef CONFIG_RCU_BOOST */
17690  /*
17691   * Queue a preemptible-RCU callback for invocation after a grace period.
17692   */
17693 @@ -829,6 +813,19 @@ void exit_rcu(void)
17695  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
17698 + * If boosting, set rcuc kthreads to realtime priority.
17699 + */
17700 +static void rcu_cpu_kthread_setup(unsigned int cpu)
17702 +#ifdef CONFIG_RCU_BOOST
17703 +       struct sched_param sp;
17705 +       sp.sched_priority = kthread_prio;
17706 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17707 +#endif /* #ifdef CONFIG_RCU_BOOST */
17710  #ifdef CONFIG_RCU_BOOST
17712  #include "../locking/rtmutex_common.h"
17713 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
17715  #endif /* #else #ifdef CONFIG_RCU_TRACE */
17717 -static void rcu_wake_cond(struct task_struct *t, int status)
17719 -       /*
17720 -        * If the thread is yielding, only wake it when this
17721 -        * is invoked from idle
17722 -        */
17723 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
17724 -               wake_up_process(t);
17727  /*
17728   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
17729   * or ->boost_tasks, advancing the pointer to the next task in the
17730 @@ -1012,23 +999,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17731         }
17735 - * Wake up the per-CPU kthread to invoke RCU callbacks.
17736 - */
17737 -static void invoke_rcu_callbacks_kthread(void)
17739 -       unsigned long flags;
17741 -       local_irq_save(flags);
17742 -       __this_cpu_write(rcu_cpu_has_work, 1);
17743 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
17744 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
17745 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
17746 -                             __this_cpu_read(rcu_cpu_kthread_status));
17747 -       }
17748 -       local_irq_restore(flags);
17751  /*
17752   * Is the current CPU running the RCU-callbacks kthread?
17753   * Caller must have preemption disabled.
17754 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17755         return 0;
17758 -static void rcu_kthread_do_work(void)
17760 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
17761 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
17762 -       rcu_preempt_do_callbacks();
17765 -static void rcu_cpu_kthread_setup(unsigned int cpu)
17767 -       struct sched_param sp;
17769 -       sp.sched_priority = kthread_prio;
17770 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17773 -static void rcu_cpu_kthread_park(unsigned int cpu)
17775 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
17778 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
17780 -       return __this_cpu_read(rcu_cpu_has_work);
17784 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
17785 - * RCU softirq used in flavors and configurations of RCU that do not
17786 - * support RCU priority boosting.
17787 - */
17788 -static void rcu_cpu_kthread(unsigned int cpu)
17790 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
17791 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
17792 -       int spincnt;
17794 -       for (spincnt = 0; spincnt < 10; spincnt++) {
17795 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
17796 -               local_bh_disable();
17797 -               *statusp = RCU_KTHREAD_RUNNING;
17798 -               this_cpu_inc(rcu_cpu_kthread_loops);
17799 -               local_irq_disable();
17800 -               work = *workp;
17801 -               *workp = 0;
17802 -               local_irq_enable();
17803 -               if (work)
17804 -                       rcu_kthread_do_work();
17805 -               local_bh_enable();
17806 -               if (*workp == 0) {
17807 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
17808 -                       *statusp = RCU_KTHREAD_WAITING;
17809 -                       return;
17810 -               }
17811 -       }
17812 -       *statusp = RCU_KTHREAD_YIELDING;
17813 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
17814 -       schedule_timeout_interruptible(2);
17815 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
17816 -       *statusp = RCU_KTHREAD_WAITING;
17819  /*
17820   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
17821   * served by the rcu_node in question.  The CPU hotplug lock is still
17822 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
17823         free_cpumask_var(cm);
17826 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
17827 -       .store                  = &rcu_cpu_kthread_task,
17828 -       .thread_should_run      = rcu_cpu_kthread_should_run,
17829 -       .thread_fn              = rcu_cpu_kthread,
17830 -       .thread_comm            = "rcuc/%u",
17831 -       .setup                  = rcu_cpu_kthread_setup,
17832 -       .park                   = rcu_cpu_kthread_park,
17835  /*
17836   * Spawn boost kthreads -- called as soon as the scheduler is running.
17837   */
17838  static void __init rcu_spawn_boost_kthreads(void)
17840         struct rcu_node *rnp;
17841 -       int cpu;
17843 -       for_each_possible_cpu(cpu)
17844 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
17845 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
17846         rcu_for_each_leaf_node(rcu_state_p, rnp)
17847                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
17849 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17850         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
17853 -static void invoke_rcu_callbacks_kthread(void)
17855 -       WARN_ON_ONCE(1);
17858  static bool rcu_is_callbacks_kthread(void)
17860         return false;
17861 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
17863  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17865 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
17866 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
17868  /*
17869   * Check to see if any future RCU-related work will need to be done
17870 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17871         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
17872                ? 0 : rcu_cpu_has_callbacks(NULL);
17874 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
17876 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
17877  /*
17878   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
17879   * after it.
17880 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
17881         return cbs_ready;
17884 +#ifndef CONFIG_PREEMPT_RT_FULL
17886  /*
17887   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
17888   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
17889 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17890         *nextevt = basemono + dj * TICK_NSEC;
17891         return 0;
17893 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
17895  /*
17896   * Prepare a CPU for idle from an RCU perspective.  The first major task
17897 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
17898 index 4f6db7e6a117..ee02e1e1b3e5 100644
17899 --- a/kernel/rcu/update.c
17900 +++ b/kernel/rcu/update.c
17901 @@ -62,7 +62,7 @@
17902  #ifndef CONFIG_TINY_RCU
17903  module_param(rcu_expedited, int, 0);
17904  module_param(rcu_normal, int, 0);
17905 -static int rcu_normal_after_boot;
17906 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17907  module_param(rcu_normal_after_boot, int, 0);
17908  #endif /* #ifndef CONFIG_TINY_RCU */
17910 @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
17912  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
17914 -static atomic_t rcu_expedited_nesting =
17915 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
17916 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
17918  /*
17919   * Should normal grace-period primitives be expedited?  Intended for
17920 @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
17921   */
17922  void rcu_end_inkernel_boot(void)
17924 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
17925 -               rcu_unexpedite_gp();
17926 +       rcu_unexpedite_gp();
17927         if (rcu_normal_after_boot)
17928                 WRITE_ONCE(rcu_normal, 1);
17930 @@ -298,6 +296,7 @@ int rcu_read_lock_held(void)
17932  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
17934 +#ifndef CONFIG_PREEMPT_RT_FULL
17935  /**
17936   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
17937   *
17938 @@ -324,6 +323,7 @@ int rcu_read_lock_bh_held(void)
17939         return in_softirq() || irqs_disabled();
17941  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
17942 +#endif
17944  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
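rcu_read_lock_bh_held() is compiled out above because PREEMPT_RT_FULL folds the -bh flavor into preemptible RCU; reader-side code keeps the same shape either way. A hedged reader-side sketch (the demo_ names are hypothetical):

#include <linux/rcupdate.h>

struct demo_entry {
        int value;
};

static struct demo_entry __rcu *demo_slot;

static int demo_read_value(void)
{
        struct demo_entry *e;
        int val = -1;

        rcu_read_lock_bh();
        e = rcu_dereference_bh(demo_slot);
        if (e)
                val = e->value;
        rcu_read_unlock_bh();

        return val;
}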
17946 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
17947 index 5e59b832ae2b..7337a7f60e3f 100644
17948 --- a/kernel/sched/Makefile
17949 +++ b/kernel/sched/Makefile
17950 @@ -17,7 +17,7 @@ endif
17952  obj-y += core.o loadavg.o clock.o cputime.o
17953  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
17954 -obj-y += wait.o swait.o completion.o idle.o
17955 +obj-y += wait.o swait.o swork.o completion.o idle.o
17956  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17957  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17958  obj-$(CONFIG_SCHEDSTATS) += stats.o
17959 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
17960 index 8d0f35debf35..b62cf6400fe0 100644
17961 --- a/kernel/sched/completion.c
17962 +++ b/kernel/sched/completion.c
17963 @@ -30,10 +30,10 @@ void complete(struct completion *x)
17965         unsigned long flags;
17967 -       spin_lock_irqsave(&x->wait.lock, flags);
17968 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17969         x->done++;
17970 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
17971 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17972 +       swake_up_locked(&x->wait);
17973 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17975  EXPORT_SYMBOL(complete);
17977 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
17979         unsigned long flags;
17981 -       spin_lock_irqsave(&x->wait.lock, flags);
17982 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17983         x->done += UINT_MAX/2;
17984 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
17985 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17986 +       swake_up_all_locked(&x->wait);
17987 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17989  EXPORT_SYMBOL(complete_all);
17991 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
17992                    long (*action)(long), long timeout, int state)
17994         if (!x->done) {
17995 -               DECLARE_WAITQUEUE(wait, current);
17996 +               DECLARE_SWAITQUEUE(wait);
17998 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
17999 +               __prepare_to_swait(&x->wait, &wait);
18000                 do {
18001                         if (signal_pending_state(state, current)) {
18002                                 timeout = -ERESTARTSYS;
18003                                 break;
18004                         }
18005                         __set_current_state(state);
18006 -                       spin_unlock_irq(&x->wait.lock);
18007 +                       raw_spin_unlock_irq(&x->wait.lock);
18008                         timeout = action(timeout);
18009 -                       spin_lock_irq(&x->wait.lock);
18010 +                       raw_spin_lock_irq(&x->wait.lock);
18011                 } while (!x->done && timeout);
18012 -               __remove_wait_queue(&x->wait, &wait);
18013 +               __finish_swait(&x->wait, &wait);
18014                 if (!x->done)
18015                         return timeout;
18016         }
18017 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
18019         might_sleep();
18021 -       spin_lock_irq(&x->wait.lock);
18022 +       raw_spin_lock_irq(&x->wait.lock);
18023         timeout = do_wait_for_common(x, action, timeout, state);
18024 -       spin_unlock_irq(&x->wait.lock);
18025 +       raw_spin_unlock_irq(&x->wait.lock);
18026         return timeout;
18029 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
18030         if (!READ_ONCE(x->done))
18031                 return 0;
18033 -       spin_lock_irqsave(&x->wait.lock, flags);
18034 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
18035         if (!x->done)
18036                 ret = 0;
18037         else
18038                 x->done--;
18039 -       spin_unlock_irqrestore(&x->wait.lock, flags);
18040 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
18041         return ret;
18043  EXPORT_SYMBOL(try_wait_for_completion);
18044 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
18045          * after it's acquired the lock.
18046          */
18047         smp_rmb();
18048 -       spin_unlock_wait(&x->wait.lock);
18049 +       raw_spin_unlock_wait(&x->wait.lock);
18050         return true;
18052  EXPORT_SYMBOL(completion_done);
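The completion.c conversion above replaces the regular wait queue with a simple swait queue under a raw spinlock, so complete() stays usable from truly atomic contexts on RT while the caller-visible API is unchanged. A minimal usage sketch (the demo_ names are hypothetical):

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
        /* ... do the work ... */
        complete(&demo_done);           /* now swake_up_locked() under a raw lock */
        return 0;
}

static void demo_wait_for_worker(void)
{
        wait_for_completion(&demo_done);        /* sleeps on the swait queue */
}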
18053 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
18054 index e5066955cc3a..ed1ebcc2ff3d 100644
18055 --- a/kernel/sched/core.c
18056 +++ b/kernel/sched/core.c
18057 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
18058   * Number of tasks to iterate in a single balance run.
18059   * Limited because this is done with IRQs disabled.
18060   */
18061 +#ifndef CONFIG_PREEMPT_RT_FULL
18062  const_debug unsigned int sysctl_sched_nr_migrate = 32;
18063 +#else
18064 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
18065 +#endif
18067  /*
18068   * period over which we average the RT time consumption, measured
18069 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
18071         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18072         rq->hrtick_timer.function = hrtick;
18073 +       rq->hrtick_timer.irqsafe = 1;
18075  #else  /* CONFIG_SCHED_HRTICK */
18076  static inline void hrtick_clear(struct rq *rq)
18077 @@ -425,9 +430,15 @@ static bool set_nr_if_polling(struct task_struct *p)
18078  #endif
18079  #endif
18081 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
18082 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
18083 +                 bool sleeper)
18085 -       struct wake_q_node *node = &task->wake_q;
18086 +       struct wake_q_node *node;
18088 +       if (sleeper)
18089 +               node = &task->wake_q_sleeper;
18090 +       else
18091 +               node = &task->wake_q;
18093         /*
18094          * Atomically grab the task, if ->wake_q is !nil already it means
18095 @@ -449,24 +460,33 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
18096         head->lastp = &node->next;
18099 -void wake_up_q(struct wake_q_head *head)
18100 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
18102         struct wake_q_node *node = head->first;
18104         while (node != WAKE_Q_TAIL) {
18105                 struct task_struct *task;
18107 -               task = container_of(node, struct task_struct, wake_q);
18108 +               if (sleeper)
18109 +                       task = container_of(node, struct task_struct, wake_q_sleeper);
18110 +               else
18111 +                       task = container_of(node, struct task_struct, wake_q);
18112                 BUG_ON(!task);
18113                 /* task can safely be re-inserted now */
18114                 node = node->next;
18115 -               task->wake_q.next = NULL;
18116 +               if (sleeper)
18117 +                       task->wake_q_sleeper.next = NULL;
18118 +               else
18119 +                       task->wake_q.next = NULL;
18121                 /*
18122                  * wake_up_process() implies a wmb() to pair with the queueing
18123                  * in wake_q_add() so as not to miss wakeups.
18124                  */
18125 -               wake_up_process(task);
18126 +               if (sleeper)
18127 +                       wake_up_lock_sleeper(task);
18128 +               else
18129 +                       wake_up_process(task);
18130                 put_task_struct(task);
18131         }
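The wake_q hunk above adds a second, "sleeper" queue node per task so that rtmutex code can defer wakeups of tasks blocked on sleeping locks; the calling pattern matches the regular wake_q one. A sketch of that pattern, assuming the 4.9-era WAKE_Q() on-stack initializer (the demo_ name is hypothetical):

#include <linux/sched.h>

/* Wake @waiter after dropping @lock: queue it while the lock is held,
 * perform the actual wakeup lock-free afterwards. */
static void demo_wake_one(raw_spinlock_t *lock, struct task_struct *waiter)
{
        WAKE_Q(wake_q);

        raw_spin_lock(lock);
        wake_q_add(&wake_q, waiter);    /* only records the task */
        raw_spin_unlock(lock);

        wake_up_q(&wake_q);             /* wakeup happens with the lock dropped */
}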
18133 @@ -502,6 +522,38 @@ void resched_curr(struct rq *rq)
18134                 trace_sched_wake_idle_without_ipi(cpu);
18137 +#ifdef CONFIG_PREEMPT_LAZY
18138 +void resched_curr_lazy(struct rq *rq)
18140 +       struct task_struct *curr = rq->curr;
18141 +       int cpu;
18143 +       if (!sched_feat(PREEMPT_LAZY)) {
18144 +               resched_curr(rq);
18145 +               return;
18146 +       }
18148 +       lockdep_assert_held(&rq->lock);
18150 +       if (test_tsk_need_resched(curr))
18151 +               return;
18153 +       if (test_tsk_need_resched_lazy(curr))
18154 +               return;
18156 +       set_tsk_need_resched_lazy(curr);
18158 +       cpu = cpu_of(rq);
18159 +       if (cpu == smp_processor_id())
18160 +               return;
18162 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
18163 +       smp_mb();
18164 +       if (!tsk_is_polling(curr))
18165 +               smp_send_reschedule(cpu);
18167 +#endif
18169  void resched_cpu(int cpu)
18171         struct rq *rq = cpu_rq(cpu);
18172 @@ -524,11 +576,14 @@ void resched_cpu(int cpu)
18173   */
18174  int get_nohz_timer_target(void)
18176 -       int i, cpu = smp_processor_id();
18177 +       int i, cpu;
18178         struct sched_domain *sd;
18180 +       preempt_disable_rt();
18181 +       cpu = smp_processor_id();
18183         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
18184 -               return cpu;
18185 +               goto preempt_en_rt;
18187         rcu_read_lock();
18188         for_each_domain(cpu, sd) {
18189 @@ -547,6 +602,8 @@ int get_nohz_timer_target(void)
18190                 cpu = housekeeping_any_cpu();
18191  unlock:
18192         rcu_read_unlock();
18193 +preempt_en_rt:
18194 +       preempt_enable_rt();
18195         return cpu;
18197  /*
18198 @@ -1092,7 +1149,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
18199         p->nr_cpus_allowed = cpumask_weight(new_mask);
18202 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18203 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
18204 +                                      const struct cpumask *new_mask)
18206         struct rq *rq = task_rq(p);
18207         bool queued, running;
18208 @@ -1121,6 +1179,98 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18209                 set_curr_task(rq, p);
18212 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18214 +       if (__migrate_disabled(p)) {
18215 +               lockdep_assert_held(&p->pi_lock);
18217 +               cpumask_copy(&p->cpus_allowed, new_mask);
18218 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
18219 +               p->migrate_disable_update = 1;
18220 +#endif
18221 +               return;
18222 +       }
18223 +       __do_set_cpus_allowed_tail(p, new_mask);
18226 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
18227 +static DEFINE_MUTEX(sched_down_mutex);
18228 +static cpumask_t sched_down_cpumask;
18230 +void tell_sched_cpu_down_begin(int cpu)
18232 +       mutex_lock(&sched_down_mutex);
18233 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
18234 +       mutex_unlock(&sched_down_mutex);
18237 +void tell_sched_cpu_down_done(int cpu)
18239 +       mutex_lock(&sched_down_mutex);
18240 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
18241 +       mutex_unlock(&sched_down_mutex);
18244 +/**
18245 + * migrate_me - try to move the current task off this cpu
18246 + *
18247 + * Used by the pin_current_cpu() code to try to get tasks
18248 + * to move off the current CPU as it is going down.
18249 + * It will only move the task if the task isn't pinned to
18250 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
18251 + * and the task is in a RUNNING state. Otherwise moving the
18252 + * task would wake it up (change its state to running) when
18253 + * the task did not expect it.
18254 + *
18255 + * Returns 1 if it succeeded in moving the current task
18256 + *         0 otherwise.
18257 + */
18258 +int migrate_me(void)
18260 +       struct task_struct *p = current;
18261 +       struct migration_arg arg;
18262 +       struct cpumask *cpumask;
18263 +       struct cpumask *mask;
18264 +       unsigned int dest_cpu;
18265 +       struct rq_flags rf;
18266 +       struct rq *rq;
18268 +       /*
18269 +        * We cannot migrate tasks bound to a CPU or tasks that are not
18270 +        * running. Moving the task would wake it up.
18271 +        */
18272 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
18273 +               return 0;
18275 +       mutex_lock(&sched_down_mutex);
18276 +       rq = task_rq_lock(p, &rf);
18278 +       cpumask = this_cpu_ptr(&sched_cpumasks);
18279 +       mask = &p->cpus_allowed;
18281 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
18283 +       if (!cpumask_weight(cpumask)) {
18284 +               /* It's only on this CPU? */
18285 +               task_rq_unlock(rq, p, &rf);
18286 +               mutex_unlock(&sched_down_mutex);
18287 +               return 0;
18288 +       }
18290 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
18292 +       arg.task = p;
18293 +       arg.dest_cpu = dest_cpu;
18295 +       task_rq_unlock(rq, p, &rf);
18297 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
18298 +       tlb_migrate_finish(p->mm);
18299 +       mutex_unlock(&sched_down_mutex);
18301 +       return 1;
18304  /*
18305   * Change a given task's CPU affinity. Migrate the thread to a
18306   * proper CPU and schedule it away if the CPU it's executing on
18307 @@ -1179,7 +1329,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
18308         }
18310         /* Can the task run on the task's current CPU? If so, we're done */
18311 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
18312 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
18313                 goto out;
18315         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
18316 @@ -1366,6 +1516,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
18317         return ret;
18320 +static bool check_task_state(struct task_struct *p, long match_state)
18322 +       bool match = false;
18324 +       raw_spin_lock_irq(&p->pi_lock);
18325 +       if (p->state == match_state || p->saved_state == match_state)
18326 +               match = true;
18327 +       raw_spin_unlock_irq(&p->pi_lock);
18329 +       return match;
18332  /*
18333   * wait_task_inactive - wait for a thread to unschedule.
18334   *
18335 @@ -1410,7 +1572,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
18336                  * is actually now running somewhere else!
18337                  */
18338                 while (task_running(rq, p)) {
18339 -                       if (match_state && unlikely(p->state != match_state))
18340 +                       if (match_state && !check_task_state(p, match_state))
18341                                 return 0;
18342                         cpu_relax();
18343                 }
18344 @@ -1425,7 +1587,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
18345                 running = task_running(rq, p);
18346                 queued = task_on_rq_queued(p);
18347                 ncsw = 0;
18348 -               if (!match_state || p->state == match_state)
18349 +               if (!match_state || p->state == match_state ||
18350 +                   p->saved_state == match_state)
18351                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
18352                 task_rq_unlock(rq, p, &rf);
18354 @@ -1680,10 +1843,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
18356         activate_task(rq, p, en_flags);
18357         p->on_rq = TASK_ON_RQ_QUEUED;
18359 -       /* if a worker is waking up, notify workqueue */
18360 -       if (p->flags & PF_WQ_WORKER)
18361 -               wq_worker_waking_up(p, cpu_of(rq));
18364  /*
18365 @@ -2018,8 +2177,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
18366          */
18367         smp_mb__before_spinlock();
18368         raw_spin_lock_irqsave(&p->pi_lock, flags);
18369 -       if (!(p->state & state))
18370 +       if (!(p->state & state)) {
18371 +               /*
18372 +                * The task might be running due to a spinlock sleeper
18373 +                * wakeup. Check the saved state and set it to running
18374 +                * if the wakeup condition is true.
18375 +                */
18376 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
18377 +                       if (p->saved_state & state) {
18378 +                               p->saved_state = TASK_RUNNING;
18379 +                               success = 1;
18380 +                       }
18381 +               }
18382                 goto out;
18383 +       }
18385 +       /*
18386 +        * If this is a regular wakeup, then we can unconditionally
18387 +        * clear the saved state of a "lock sleeper".
18388 +        */
18389 +       if (!(wake_flags & WF_LOCK_SLEEPER))
18390 +               p->saved_state = TASK_RUNNING;
18392         trace_sched_waking(p);
18394 @@ -2101,53 +2279,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
18395         return success;
18398 -/**
18399 - * try_to_wake_up_local - try to wake up a local task with rq lock held
18400 - * @p: the thread to be awakened
18401 - * @cookie: context's cookie for pinning
18402 - *
18403 - * Put @p on the run-queue if it's not already there. The caller must
18404 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
18405 - * the current task.
18406 - */
18407 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
18409 -       struct rq *rq = task_rq(p);
18411 -       if (WARN_ON_ONCE(rq != this_rq()) ||
18412 -           WARN_ON_ONCE(p == current))
18413 -               return;
18415 -       lockdep_assert_held(&rq->lock);
18417 -       if (!raw_spin_trylock(&p->pi_lock)) {
18418 -               /*
18419 -                * This is OK, because current is on_cpu, which avoids it being
18420 -                * picked for load-balance and preemption/IRQs are still
18421 -                * disabled avoiding further scheduler activity on it and we've
18422 -                * not yet picked a replacement task.
18423 -                */
18424 -               lockdep_unpin_lock(&rq->lock, cookie);
18425 -               raw_spin_unlock(&rq->lock);
18426 -               raw_spin_lock(&p->pi_lock);
18427 -               raw_spin_lock(&rq->lock);
18428 -               lockdep_repin_lock(&rq->lock, cookie);
18429 -       }
18431 -       if (!(p->state & TASK_NORMAL))
18432 -               goto out;
18434 -       trace_sched_waking(p);
18436 -       if (!task_on_rq_queued(p))
18437 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
18439 -       ttwu_do_wakeup(rq, p, 0, cookie);
18440 -       ttwu_stat(p, smp_processor_id(), 0);
18441 -out:
18442 -       raw_spin_unlock(&p->pi_lock);
18445  /**
18446   * wake_up_process - Wake up a specific process
18447   * @p: The process to be woken up.
18448 @@ -2166,6 +2297,18 @@ int wake_up_process(struct task_struct *p)
18450  EXPORT_SYMBOL(wake_up_process);
18452 +/**
18453 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
18454 + * @p: The process to be woken up.
18455 + *
18456 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
18457 + * the nature of the wakeup.
18458 + */
18459 +int wake_up_lock_sleeper(struct task_struct *p)
18461 +       return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
18464  int wake_up_state(struct task_struct *p, unsigned int state)
18466         return try_to_wake_up(p, state, 0);
18467 @@ -2442,6 +2585,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
18468         p->on_cpu = 0;
18469  #endif
18470         init_task_preempt_count(p);
18471 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
18472 +       task_thread_info(p)->preempt_lazy_count = 0;
18473 +#endif
18474  #ifdef CONFIG_SMP
18475         plist_node_init(&p->pushable_tasks, MAX_PRIO);
18476         RB_CLEAR_NODE(&p->pushable_dl_tasks);
18477 @@ -2770,21 +2916,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
18478         finish_arch_post_lock_switch();
18480         fire_sched_in_preempt_notifiers(current);
18481 +       /*
18482 +        * We use mmdrop_delayed() here so we don't have to do the
18483 +        * full __mmdrop() when we are the last user.
18484 +        */
18485         if (mm)
18486 -               mmdrop(mm);
18487 +               mmdrop_delayed(mm);
18488         if (unlikely(prev_state == TASK_DEAD)) {
18489                 if (prev->sched_class->task_dead)
18490                         prev->sched_class->task_dead(prev);
18492 -               /*
18493 -                * Remove function-return probe instances associated with this
18494 -                * task and put them back on the free list.
18495 -                */
18496 -               kprobe_flush_task(prev);
18498 -               /* Task is done with its stack. */
18499 -               put_task_stack(prev);
18501                 put_task_struct(prev);
18502         }
18504 @@ -3252,6 +3393,114 @@ static inline void schedule_debug(struct task_struct *prev)
18505         schedstat_inc(this_rq()->sched_count);
18508 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
18510 +void migrate_disable(void)
18512 +       struct task_struct *p = current;
18514 +       if (in_atomic() || irqs_disabled()) {
18515 +#ifdef CONFIG_SCHED_DEBUG
18516 +               p->migrate_disable_atomic++;
18517 +#endif
18518 +               return;
18519 +       }
18521 +#ifdef CONFIG_SCHED_DEBUG
18522 +       if (unlikely(p->migrate_disable_atomic)) {
18523 +               tracing_off();
18524 +               WARN_ON_ONCE(1);
18525 +       }
18526 +#endif
18528 +       if (p->migrate_disable) {
18529 +               p->migrate_disable++;
18530 +               return;
18531 +       }
18533 +       preempt_disable();
18534 +       preempt_lazy_disable();
18535 +       pin_current_cpu();
18536 +       p->migrate_disable = 1;
18537 +       preempt_enable();
18539 +EXPORT_SYMBOL(migrate_disable);
18541 +void migrate_enable(void)
18543 +       struct task_struct *p = current;
18545 +       if (in_atomic() || irqs_disabled()) {
18546 +#ifdef CONFIG_SCHED_DEBUG
18547 +               p->migrate_disable_atomic--;
18548 +#endif
18549 +               return;
18550 +       }
18552 +#ifdef CONFIG_SCHED_DEBUG
18553 +       if (unlikely(p->migrate_disable_atomic)) {
18554 +               tracing_off();
18555 +               WARN_ON_ONCE(1);
18556 +       }
18557 +#endif
18558 +       WARN_ON_ONCE(p->migrate_disable <= 0);
18560 +       if (p->migrate_disable > 1) {
18561 +               p->migrate_disable--;
18562 +               return;
18563 +       }
18565 +       preempt_disable();
18566 +       /*
18567 +        * Clearing migrate_disable causes tsk_cpus_allowed to
18568 +        * show the task's original cpu affinity.
18569 +        */
18570 +       p->migrate_disable = 0;
18572 +       if (p->migrate_disable_update) {
18573 +               struct rq *rq;
18574 +               struct rq_flags rf;
18576 +               rq = task_rq_lock(p, &rf);
18577 +               update_rq_clock(rq);
18579 +               __do_set_cpus_allowed_tail(p, &p->cpus_allowed);
18580 +               task_rq_unlock(rq, p, &rf);
18582 +               p->migrate_disable_update = 0;
18584 +               WARN_ON(smp_processor_id() != task_cpu(p));
18585 +               if (!cpumask_test_cpu(task_cpu(p), &p->cpus_allowed)) {
18586 +                       const struct cpumask *cpu_valid_mask = cpu_active_mask;
18587 +                       struct migration_arg arg;
18588 +                       unsigned int dest_cpu;
18590 +                       if (p->flags & PF_KTHREAD) {
18591 +                               /*
18592 +                                * Kernel threads are allowed on online && !active CPUs
18593 +                                */
18594 +                               cpu_valid_mask = cpu_online_mask;
18595 +                       }
18596 +                       dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_allowed);
18597 +                       arg.task = p;
18598 +                       arg.dest_cpu = dest_cpu;
18600 +                       unpin_current_cpu();
18601 +                       preempt_lazy_enable();
18602 +                       preempt_enable();
18603 +                       stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
18604 +                       tlb_migrate_finish(p->mm);
18605 +                       return;
18606 +               }
18607 +       }
18609 +       unpin_current_cpu();
18610 +       preempt_enable();
18611 +       preempt_lazy_enable();
18613 +EXPORT_SYMBOL(migrate_enable);
18614 +#endif
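migrate_disable()/migrate_enable(), added above for PREEMPT_RT_FULL, pin the current task to its CPU while leaving it preemptible, which is what per-CPU data users need once spinlocks can sleep. A hedged usage sketch (the demo_ per-CPU structure is hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

struct demo_stats {
        spinlock_t lock;        /* a sleeping lock on RT */
        unsigned long events;
};

static DEFINE_PER_CPU(struct demo_stats, demo_stats);

static void demo_count_event(void)
{
        struct demo_stats *s;

        migrate_disable();              /* stay on this CPU, but stay preemptible */
        s = this_cpu_ptr(&demo_stats);
        spin_lock(&s->lock);            /* may sleep on RT; preempt_disable() would forbid this */
        s->events++;
        spin_unlock(&s->lock);
        migrate_enable();
}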
18616  /*
18617   * Pick up the highest-prio task:
18618   */
18619 @@ -3368,19 +3617,6 @@ static void __sched notrace __schedule(bool preempt)
18620                 } else {
18621                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
18622                         prev->on_rq = 0;
18624 -                       /*
18625 -                        * If a worker went to sleep, notify and ask workqueue
18626 -                        * whether it wants to wake up a task to maintain
18627 -                        * concurrency.
18628 -                        */
18629 -                       if (prev->flags & PF_WQ_WORKER) {
18630 -                               struct task_struct *to_wakeup;
18632 -                               to_wakeup = wq_worker_sleeping(prev);
18633 -                               if (to_wakeup)
18634 -                                       try_to_wake_up_local(to_wakeup, cookie);
18635 -                       }
18636                 }
18637                 switch_count = &prev->nvcsw;
18638         }
18639 @@ -3390,6 +3626,7 @@ static void __sched notrace __schedule(bool preempt)
18641         next = pick_next_task(rq, prev, cookie);
18642         clear_tsk_need_resched(prev);
18643 +       clear_tsk_need_resched_lazy(prev);
18644         clear_preempt_need_resched();
18645         rq->clock_skip_update = 0;
18647 @@ -3437,8 +3674,19 @@ void __noreturn do_task_dead(void)
18649  static inline void sched_submit_work(struct task_struct *tsk)
18651 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
18652 +       if (!tsk->state)
18653                 return;
18654 +       /*
18655 +        * If a worker went to sleep, notify and ask workqueue whether
18656 +        * it wants to wake up a task to maintain concurrency.
18657 +        */
18658 +       if (tsk->flags & PF_WQ_WORKER)
18659 +               wq_worker_sleeping(tsk);
18662 +       if (tsk_is_pi_blocked(tsk))
18663 +               return;
18665         /*
18666          * If we are going to sleep and we have plugged IO queued,
18667          * make sure to submit it to avoid deadlocks.
18668 @@ -3447,6 +3695,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
18669                 blk_schedule_flush_plug(tsk);
18672 +static void sched_update_worker(struct task_struct *tsk)
18674 +       if (tsk->flags & PF_WQ_WORKER)
18675 +               wq_worker_running(tsk);
18678  asmlinkage __visible void __sched schedule(void)
18680         struct task_struct *tsk = current;
18681 @@ -3457,6 +3711,7 @@ asmlinkage __visible void __sched schedule(void)
18682                 __schedule(false);
18683                 sched_preempt_enable_no_resched();
18684         } while (need_resched());
18685 +       sched_update_worker(tsk);
18687  EXPORT_SYMBOL(schedule);
18689 @@ -3520,6 +3775,30 @@ static void __sched notrace preempt_schedule_common(void)
18690         } while (need_resched());
18693 +#ifdef CONFIG_PREEMPT_LAZY
18695 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it
18696 + * is set by an RT task. Otherwise we try to avoid being scheduled out as
18697 + * long as the preempt_lazy_count counter is > 0.
18698 + */
18699 +static __always_inline int preemptible_lazy(void)
18701 +       if (test_thread_flag(TIF_NEED_RESCHED))
18702 +               return 1;
18703 +       if (current_thread_info()->preempt_lazy_count)
18704 +               return 0;
18705 +       return 1;
18708 +#else
18710 +static inline int preemptible_lazy(void)
18712 +       return 1;
18715 +#endif
18717  #ifdef CONFIG_PREEMPT
18718  /*
18719   * this is the entry point to schedule() from in-kernel preemption
18720 @@ -3534,7 +3813,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
18721          */
18722         if (likely(!preemptible()))
18723                 return;
18725 +       if (!preemptible_lazy())
18726 +               return;
18727         preempt_schedule_common();
18729  NOKPROBE_SYMBOL(preempt_schedule);
18730 @@ -3561,6 +3841,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18731         if (likely(!preemptible()))
18732                 return;
18734 +       if (!preemptible_lazy())
18735 +               return;
18737         do {
18738                 /*
18739                  * Because the function tracer can trace preempt_count_sub()
18740 @@ -3583,7 +3866,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18741                  * an infinite recursion.
18742                  */
18743                 prev_ctx = exception_enter();
18744 +               /*
18745 +                * The add/subtract must not be traced by the function
18746 +                * tracer. But we still want to account for the
18747 +                * preempt-off latency tracer. Since the _notrace versions
18748 +                * of add/subtract skip the accounting for the latency tracer,
18749 +                * we must force it manually.
18750 +                */
18751 +               start_critical_timings();
18752                 __schedule(true);
18753 +               stop_critical_timings();
18754                 exception_exit(prev_ctx);
18756                 preempt_latency_stop(1);
18757 @@ -3629,10 +3921,25 @@ EXPORT_SYMBOL(default_wake_function);
18759  #ifdef CONFIG_RT_MUTEXES
18761 +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
18763 +       if (pi_task)
18764 +               prio = min(prio, pi_task->prio);
18766 +       return prio;
18769 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18771 +       struct task_struct *pi_task = rt_mutex_get_top_task(p);
18773 +       return __rt_effective_prio(pi_task, prio);
18776  /*
18777   * rt_mutex_setprio - set the current priority of a task
18778 - * @p: task
18779 - * @prio: prio value (kernel-internal form)
18780 + * @p: task to boost
18781 + * @pi_task: donor task
18782   *
18783   * This function changes the 'effective' priority of a task. It does
18784   * not touch ->normal_prio like __setscheduler().
18785 @@ -3640,16 +3947,40 @@ EXPORT_SYMBOL(default_wake_function);
18786   * Used by the rt_mutex code to implement priority inheritance
18787   * logic. Call site only calls if the priority of the task changed.
18788   */
18789 -void rt_mutex_setprio(struct task_struct *p, int prio)
18790 +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
18792 -       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18793 +       int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18794         const struct sched_class *prev_class;
18795         struct rq_flags rf;
18796         struct rq *rq;
18798 -       BUG_ON(prio > MAX_PRIO);
18799 +       /* XXX used to be waiter->prio, not waiter->task->prio */
18800 +       prio = __rt_effective_prio(pi_task, p->normal_prio);
18802 +       /*
18803 +        * If nothing changed; bail early.
18804 +        */
18805 +       if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
18806 +               return;
18808         rq = __task_rq_lock(p, &rf);
18809 +       /*
18810 +        * Set under pi_lock && rq->lock, such that the value can be used under
18811 +        * either lock.
18812 +        *
18813 +        * Note that a lot of trickiness is needed to make this pointer cache work
18814 +        * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
18815 +        * ensure a task is de-boosted (pi_task is set to NULL) before the
18816 +        * task is allowed to run again (and can exit). This ensures the pointer
18817 +        * points to a blocked task -- which guarantees the task is present.
18818 +        */
18819 +       p->pi_top_task = pi_task;
18821 +       /*
18822 +        * For FIFO/RR we only need to set prio, if that matches we're done.
18823 +        */
18824 +       if (prio == p->prio && !dl_prio(prio))
18825 +               goto out_unlock;
18827         /*
18828          * Idle task boosting is a nono in general. There is one
18829 @@ -3669,7 +4000,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18830                 goto out_unlock;
18831         }
18833 -       trace_sched_pi_setprio(p, prio);
18834 +       trace_sched_pi_setprio(p, pi_task);
18835         oldprio = p->prio;
18837         if (oldprio == prio)
18838 @@ -3693,7 +4024,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18839          *          running task
18840          */
18841         if (dl_prio(prio)) {
18842 -               struct task_struct *pi_task = rt_mutex_get_top_task(p);
18843                 if (!dl_prio(p->normal_prio) ||
18844                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
18845                         p->dl.dl_boosted = 1;
18846 @@ -3730,6 +4060,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18847         balance_callback(rq);
18848         preempt_enable();
18850 +#else
18851 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18853 +       return prio;
18855  #endif
18857  void set_user_nice(struct task_struct *p, long nice)
18858 @@ -3974,10 +4309,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
18859          * Keep a potential priority boosting if called from
18860          * sched_setscheduler().
18861          */
18862 +       p->prio = normal_prio(p);
18863         if (keep_boost)
18864 -               p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
18865 -       else
18866 -               p->prio = normal_prio(p);
18867 +               p->prio = rt_effective_prio(p, p->prio);
18869         if (dl_prio(p->prio))
18870                 p->sched_class = &dl_sched_class;
18871 @@ -4264,7 +4598,7 @@ static int __sched_setscheduler(struct task_struct *p,
18872                  * the runqueue. This will be done when the task deboost
18873                  * itself.
18874                  */
18875 -               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
18876 +               new_effective_prio = rt_effective_prio(p, newprio);
18877                 if (new_effective_prio == oldprio)
18878                         queue_flags &= ~DEQUEUE_MOVE;
18879         }
18880 @@ -4939,6 +5273,7 @@ int __cond_resched_lock(spinlock_t *lock)
18882  EXPORT_SYMBOL(__cond_resched_lock);
18884 +#ifndef CONFIG_PREEMPT_RT_FULL
18885  int __sched __cond_resched_softirq(void)
18887         BUG_ON(!in_softirq());
18888 @@ -4952,6 +5287,7 @@ int __sched __cond_resched_softirq(void)
18889         return 0;
18891  EXPORT_SYMBOL(__cond_resched_softirq);
18892 +#endif
18894  /**
18895   * yield - yield the current processor to other threads.
18896 @@ -5315,7 +5651,9 @@ void init_idle(struct task_struct *idle, int cpu)
18898         /* Set the preempt count _outside_ the spinlocks! */
18899         init_idle_preempt_count(idle, cpu);
18901 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
18902 +       task_thread_info(idle)->preempt_lazy_count = 0;
18903 +#endif
18904         /*
18905          * The idle tasks have their own, simple scheduling class:
18906          */
18907 @@ -5458,6 +5796,8 @@ void sched_setnuma(struct task_struct *p, int nid)
18908  #endif /* CONFIG_NUMA_BALANCING */
18910  #ifdef CONFIG_HOTPLUG_CPU
18911 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
18913  /*
18914   * Ensures that the idle task is using init_mm right before its cpu goes
18915   * offline.
18916 @@ -5472,7 +5812,12 @@ void idle_task_exit(void)
18917                 switch_mm(mm, &init_mm, current);
18918                 finish_arch_post_lock_switch();
18919         }
18920 -       mmdrop(mm);
18921 +       /*
18922 +        * Defer the cleanup to an alive cpu. On RT we can neither
18923 +        * call mmdrop() nor mmdrop_delayed() from here.
18924 +        */
18925 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
18929  /*
18930 @@ -5881,6 +6226,7 @@ static int init_rootdomain(struct root_domain *rd)
18931         rd->rto_cpu = -1;
18932         raw_spin_lock_init(&rd->rto_lock);
18933         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
18934 +       rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
18935  #endif
18937         init_dl_bw(&rd->dl_bw);
18938 @@ -7439,6 +7785,10 @@ int sched_cpu_dying(unsigned int cpu)
18939         update_max_interval();
18940         nohz_balance_exit_idle(cpu);
18941         hrtick_clear(rq);
18942 +       if (per_cpu(idle_last_mm, cpu)) {
18943 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
18944 +               per_cpu(idle_last_mm, cpu) = NULL;
18945 +       }
18946         return 0;
18948  #endif
18949 @@ -7700,7 +8050,7 @@ void __init sched_init(void)
18950  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
18951  static inline int preempt_count_equals(int preempt_offset)
18953 -       int nested = preempt_count() + rcu_preempt_depth();
18954 +       int nested = preempt_count() + sched_rcu_preempt_depth();
18956         return (nested == preempt_offset);
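The __rt_effective_prio()/rt_effective_prio() helpers added above reduce priority inheritance to a single rule: a task runs at the numerically lowest (i.e. highest) priority of its own priority and that of its top pi_task donor. A minimal stand-alone sketch of that rule, with illustrative types and values rather than the kernel's own:

#include <stdio.h>

struct task { int prio; };          /* lower number = higher priority */

/* mirrors the min() in __rt_effective_prio() above */
static int effective_prio(const struct task *p, const struct task *pi_top)
{
        int prio = p->prio;

        if (pi_top && pi_top->prio < prio)
                prio = pi_top->prio;            /* boost to the donor's level */
        return prio;
}

int main(void)
{
        struct task owner  = { .prio = 120 };   /* SCHED_OTHER lock owner */
        struct task waiter = { .prio = 10 };    /* RT waiter blocked on the lock */

        printf("boosted prio: %d\n", effective_prio(&owner, &waiter));  /* 10 */
        return 0;
}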
18958 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
18959 index df5c32a0c6ed..c77fd444dc3c 100644
18960 --- a/kernel/sched/deadline.c
18961 +++ b/kernel/sched/deadline.c
18962 @@ -693,6 +693,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
18964         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18965         timer->function = dl_task_timer;
18966 +       timer->irqsafe = 1;
18969  /*
18970 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
18971 index fa178b62ea79..935224123441 100644
18972 --- a/kernel/sched/debug.c
18973 +++ b/kernel/sched/debug.c
18974 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
18975         P(rt_throttled);
18976         PN(rt_time);
18977         PN(rt_runtime);
18978 +#ifdef CONFIG_SMP
18979 +       P(rt_nr_migratory);
18980 +#endif
18982  #undef PN
18983  #undef P
18984 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
18985  #endif
18986         P(policy);
18987         P(prio);
18988 +#ifdef CONFIG_PREEMPT_RT_FULL
18989 +       P(migrate_disable);
18990 +#endif
18991 +       P(nr_cpus_allowed);
18992  #undef PN_SCHEDSTAT
18993  #undef PN
18994  #undef __PN
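The debug hunk above adds migrate_disable and nr_cpus_allowed to the per-task scheduler statistics, which are exported through /proc/<pid>/sched. A small user-space reader to inspect the effect (field availability depends on the kernel configuration):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/sched", "r");

        if (!f) {
                perror("/proc/self/sched");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* print only the fields touched by the hunk above */
                if (strstr(line, "nr_cpus_allowed") ||
                    strstr(line, "migrate_disable") ||
                    strstr(line, "policy") || strstr(line, "prio"))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}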
18995 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
18996 index 3d862f5b0331..c6db32c0c557 100644
18997 --- a/kernel/sched/fair.c
18998 +++ b/kernel/sched/fair.c
18999 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
19000         ideal_runtime = sched_slice(cfs_rq, curr);
19001         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
19002         if (delta_exec > ideal_runtime) {
19003 -               resched_curr(rq_of(cfs_rq));
19004 +               resched_curr_lazy(rq_of(cfs_rq));
19005                 /*
19006                  * The current task ran long enough, ensure it doesn't get
19007                  * re-elected due to buddy favours.
19008 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
19009                 return;
19011         if (delta > ideal_runtime)
19012 -               resched_curr(rq_of(cfs_rq));
19013 +               resched_curr_lazy(rq_of(cfs_rq));
19016  static void
19017 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
19018          * validating it and just reschedule.
19019          */
19020         if (queued) {
19021 -               resched_curr(rq_of(cfs_rq));
19022 +               resched_curr_lazy(rq_of(cfs_rq));
19023                 return;
19024         }
19025         /*
19026 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
19027          * hierarchy can be throttled
19028          */
19029         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
19030 -               resched_curr(rq_of(cfs_rq));
19031 +               resched_curr_lazy(rq_of(cfs_rq));
19034  static __always_inline
19035 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
19037                 if (delta < 0) {
19038                         if (rq->curr == p)
19039 -                               resched_curr(rq);
19040 +                               resched_curr_lazy(rq);
19041                         return;
19042                 }
19043                 hrtick_start(rq, delta);
19044 @@ -5862,7 +5862,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
19045         return;
19047  preempt:
19048 -       resched_curr(rq);
19049 +       resched_curr_lazy(rq);
19050         /*
19051          * Only set the backward buddy when the current task is still
19052          * on the rq. This can happen when a wakeup gets interleaved
19053 @@ -8588,7 +8588,7 @@ static void task_fork_fair(struct task_struct *p)
19054                  * 'current' within the tree based on its new key value.
19055                  */
19056                 swap(curr->vruntime, se->vruntime);
19057 -               resched_curr(rq);
19058 +               resched_curr_lazy(rq);
19059         }
19061         se->vruntime -= cfs_rq->min_vruntime;
19062 @@ -8612,7 +8612,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
19063          */
19064         if (rq->curr == p) {
19065                 if (p->prio > oldprio)
19066 -                       resched_curr(rq);
19067 +                       resched_curr_lazy(rq);
19068         } else
19069                 check_preempt_curr(rq, p, 0);
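The fair-class hunks above replace every resched_curr() call with resched_curr_lazy(). Conceptually, the lazy variant only records a deferred need-resched condition that takes effect at the next preemption point once the lazy-preempt count is zero, while RT-driven preemption keeps using the immediate flag; the sketch below illustrates that split with made-up names and is not the patch's implementation:

#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
        bool need_resched;              /* TIF_NEED_RESCHED analogue */
        bool need_resched_lazy;         /* TIF_NEED_RESCHED_LAZY analogue */
};

static void resched_now(struct cpu_state *c)  { c->need_resched = true; }
static void resched_lazy(struct cpu_state *c) { c->need_resched_lazy = true; }

/*
 * At a preemption point the immediate flag always wins; the lazy flag only
 * forces a schedule once the lazy-preempt count has dropped back to zero.
 */
static bool should_resched(const struct cpu_state *c, int preempt_lazy_count)
{
        if (c->need_resched)
                return true;
        return c->need_resched_lazy && preempt_lazy_count == 0;
}

int main(void)
{
        struct cpu_state c = { false, false };

        resched_lazy(&c);
        printf("%d %d\n", should_resched(&c, 1), should_resched(&c, 0)); /* 0 1 */
        resched_now(&c);
        printf("%d\n", should_resched(&c, 1));                           /* 1 */
        return 0;
}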
19071 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
19072 index 1b3c8189b286..36086f74e011 100644
19073 --- a/kernel/sched/features.h
19074 +++ b/kernel/sched/features.h
19075 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
19076   */
19077  SCHED_FEAT(NONTASK_CAPACITY, true)
19079 +#ifdef CONFIG_PREEMPT_RT_FULL
19080 +SCHED_FEAT(TTWU_QUEUE, false)
19081 +# ifdef CONFIG_PREEMPT_LAZY
19082 +SCHED_FEAT(PREEMPT_LAZY, true)
19083 +# endif
19084 +#else
19086  /*
19087   * Queue remote wakeups on the target CPU and process them
19088   * using the scheduler IPI. Reduces rq->lock contention/bounces.
19089   */
19090  SCHED_FEAT(TTWU_QUEUE, true)
19091 +#endif
19093  /*
19094   * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
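Which scheduler features (e.g. TTWU_QUEUE, PREEMPT_LAZY) ended up enabled can be verified at run time: with CONFIG_SCHED_DEBUG the scheduler exports its feature flags through debugfs, with disabled features prefixed NO_. A small reader, assuming debugfs is mounted at the usual location:

#include <stdio.h>

int main(void)
{
        char line[1024];
        FILE *f = fopen("/sys/kernel/debug/sched_features", "r");

        if (!f) {
                perror("sched_features");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* disabled features show up as NO_<NAME> */
        fclose(f);
        return 0;
}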
19095 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
19096 index 7a360d6f6798..d361629c0f96 100644
19097 --- a/kernel/sched/rt.c
19098 +++ b/kernel/sched/rt.c
19099 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
19101         hrtimer_init(&rt_b->rt_period_timer,
19102                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
19103 +       rt_b->rt_period_timer.irqsafe = 1;
19104         rt_b->rt_period_timer.function = sched_rt_period_timer;
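Both the deadline task timer and the RT bandwidth period timer are marked irqsafe so that, with the RT hrtimer changes made elsewhere in this patch, they keep expiring from hard interrupt context rather than the softirq-threaded hrtimer path. A kernel-style sketch of the same setup pattern; the example_* names are illustrative, and the irqsafe field is the one introduced by this patch:

#include <linux/hrtimer.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
        /* runs in hard interrupt context because of ->irqsafe below */
        return HRTIMER_NORESTART;
}

static void example_timer_setup(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        example_timer.irqsafe = 1;      /* field added by this RT patch */
        example_timer.function = example_timer_fn;
}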
19107 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
19108 index cff985feb6e7..280c7d5a7657 100644
19109 --- a/kernel/sched/sched.h
19110 +++ b/kernel/sched/sched.h
19111 @@ -1162,6 +1162,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
19112  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
19113  #define WF_FORK                0x02            /* child wakeup after fork */
19114  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
19115 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
19117  /*
19118   * To aid in avoiding the subversion of "niceness" due to uneven distribution
19119 @@ -1345,6 +1346,15 @@ extern void init_sched_fair_class(void);
19120  extern void resched_curr(struct rq *rq);
19121  extern void resched_cpu(int cpu);
19123 +#ifdef CONFIG_PREEMPT_LAZY
19124 +extern void resched_curr_lazy(struct rq *rq);
19125 +#else
19126 +static inline void resched_curr_lazy(struct rq *rq)
19128 +       resched_curr(rq);
19130 +#endif
19132  extern struct rt_bandwidth def_rt_bandwidth;
19133  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
19135 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
19136 index 82f0dff90030..ef027ff3250a 100644
19137 --- a/kernel/sched/swait.c
19138 +++ b/kernel/sched/swait.c
19139 @@ -1,5 +1,6 @@
19140  #include <linux/sched.h>
19141  #include <linux/swait.h>
19142 +#include <linux/suspend.h>
19144  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
19145                              struct lock_class_key *key)
19146 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
19148  EXPORT_SYMBOL(swake_up_locked);
19150 +void swake_up_all_locked(struct swait_queue_head *q)
19152 +       struct swait_queue *curr;
19153 +       int wakes = 0;
19155 +       while (!list_empty(&q->task_list)) {
19157 +               curr = list_first_entry(&q->task_list, typeof(*curr),
19158 +                                       task_list);
19159 +               wake_up_process(curr->task);
19160 +               list_del_init(&curr->task_list);
19161 +               wakes++;
19162 +       }
19163 +       if (pm_in_action)
19164 +               return;
19165 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
19167 +EXPORT_SYMBOL(swake_up_all_locked);
19169  void swake_up(struct swait_queue_head *q)
19171         unsigned long flags;
19172 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
19173         if (!swait_active(q))
19174                 return;
19176 +       WARN_ON(irqs_disabled());
19177         raw_spin_lock_irq(&q->lock);
19178         list_splice_init(&q->task_list, &tmp);
19179         while (!list_empty(&tmp)) {
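For reference, the simple-waitqueue API these helpers serve; on RT, completions are built on top of swait, so a complete_all() issued from interrupt context ends up in swake_up_all_locked() above and is expected to find only a few waiters (hence the WARN). An illustrative kernel-style usage sketch with made-up names:

#include <linux/types.h>
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(my_wq);
static bool my_cond;

/* waiter side: sleeps until my_cond becomes true */
static void wait_for_it(void)
{
        swait_event(my_wq, my_cond);
}

/* waker side: may run from hard interrupt context on RT */
static void signal_it(void)
{
        my_cond = true;
        swake_up(&my_wq);
}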
19180 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
19181 new file mode 100644
19182 index 000000000000..1950f40ca725
19183 --- /dev/null
19184 +++ b/kernel/sched/swork.c
19185 @@ -0,0 +1,173 @@
19187 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
19188 + *
19189 + * Provides a framework for enqueuing callbacks from irq context in a
19190 + * PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
19191 + */
19193 +#include <linux/swait.h>
19194 +#include <linux/swork.h>
19195 +#include <linux/kthread.h>
19196 +#include <linux/slab.h>
19197 +#include <linux/spinlock.h>
19198 +#include <linux/export.h>
19200 +#define SWORK_EVENT_PENDING     (1 << 0)
19202 +static DEFINE_MUTEX(worker_mutex);
19203 +static struct sworker *glob_worker;
19205 +struct sworker {
19206 +       struct list_head events;
19207 +       struct swait_queue_head wq;
19209 +       raw_spinlock_t lock;
19211 +       struct task_struct *task;
19212 +       int refs;
19215 +static bool swork_readable(struct sworker *worker)
19217 +       bool r;
19219 +       if (kthread_should_stop())
19220 +               return true;
19222 +       raw_spin_lock_irq(&worker->lock);
19223 +       r = !list_empty(&worker->events);
19224 +       raw_spin_unlock_irq(&worker->lock);
19226 +       return r;
19229 +static int swork_kthread(void *arg)
19231 +       struct sworker *worker = arg;
19233 +       for (;;) {
19234 +               swait_event_interruptible(worker->wq,
19235 +                                       swork_readable(worker));
19236 +               if (kthread_should_stop())
19237 +                       break;
19239 +               raw_spin_lock_irq(&worker->lock);
19240 +               while (!list_empty(&worker->events)) {
19241 +                       struct swork_event *sev;
19243 +                       sev = list_first_entry(&worker->events,
19244 +                                       struct swork_event, item);
19245 +                       list_del(&sev->item);
19246 +                       raw_spin_unlock_irq(&worker->lock);
19248 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
19249 +                                                        &sev->flags));
19250 +                       sev->func(sev);
19251 +                       raw_spin_lock_irq(&worker->lock);
19252 +               }
19253 +               raw_spin_unlock_irq(&worker->lock);
19254 +       }
19255 +       return 0;
19258 +static struct sworker *swork_create(void)
19260 +       struct sworker *worker;
19262 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
19263 +       if (!worker)
19264 +               return ERR_PTR(-ENOMEM);
19266 +       INIT_LIST_HEAD(&worker->events);
19267 +       raw_spin_lock_init(&worker->lock);
19268 +       init_swait_queue_head(&worker->wq);
19270 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
19271 +       if (IS_ERR(worker->task)) {
19272 +               kfree(worker);
19273 +               return ERR_PTR(-ENOMEM);
19274 +       }
19276 +       return worker;
19279 +static void swork_destroy(struct sworker *worker)
19281 +       kthread_stop(worker->task);
19283 +       WARN_ON(!list_empty(&worker->events));
19284 +       kfree(worker);
19287 +/**
19288 + * swork_queue - queue swork
19289 + *
19290 + * Returns %false if @sev was already on a queue, %true otherwise.
19291 + *
19292 + * The work is queued and processed on a random CPU
19293 + */
19294 +bool swork_queue(struct swork_event *sev)
19296 +       unsigned long flags;
19298 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
19299 +               return false;
19301 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
19302 +       list_add_tail(&sev->item, &glob_worker->events);
19303 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
19305 +       swake_up(&glob_worker->wq);
19306 +       return true;
19308 +EXPORT_SYMBOL_GPL(swork_queue);
19310 +/**
19311 + * swork_get - get an instance of the sworker
19312 + *
19313 + * Returns a negative error code if the initialization of the worker
19314 + * failed, %0 otherwise.
19315 + *
19316 + */
19317 +int swork_get(void)
19319 +       struct sworker *worker;
19321 +       mutex_lock(&worker_mutex);
19322 +       if (!glob_worker) {
19323 +               worker = swork_create();
19324 +               if (IS_ERR(worker)) {
19325 +                       mutex_unlock(&worker_mutex);
19326 +                       return -ENOMEM;
19327 +               }
19329 +               glob_worker = worker;
19330 +       }
19332 +       glob_worker->refs++;
19333 +       mutex_unlock(&worker_mutex);
19335 +       return 0;
19337 +EXPORT_SYMBOL_GPL(swork_get);
19339 +/**
19340 + * swork_put - puts an instance of the sworker
19341 + *
19342 + * Will destroy the sworker thread. This function must not be called until all
19343 + * queued events have been completed.
19344 + */
19345 +void swork_put(void)
19347 +       mutex_lock(&worker_mutex);
19349 +       glob_worker->refs--;
19350 +       if (glob_worker->refs > 0)
19351 +               goto out;
19353 +       swork_destroy(glob_worker);
19354 +       glob_worker = NULL;
19355 +out:
19356 +       mutex_unlock(&worker_mutex);
19358 +EXPORT_SYMBOL_GPL(swork_put);
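A sketch of how a driver could use the swork framework added above, assuming the linux/swork.h header (struct swork_event, INIT_SWORK()) that is introduced elsewhere in this patch; all names apart from the swork_* calls are illustrative:

#include <linux/swork.h>
#include <linux/interrupt.h>

static struct swork_event my_event;

/* runs in the "kswork" kthread, fully preemptible even on RT */
static void my_deferred_fn(struct swork_event *sev)
{
        /* ... do the work that must not run in hard irq context ... */
}

static int my_driver_init(void)
{
        int err = swork_get();          /* create or reference the worker */

        if (err)
                return err;
        INIT_SWORK(&my_event, my_deferred_fn);
        return 0;
}

static irqreturn_t my_hardirq(int irq, void *dev_id)
{
        swork_queue(&my_event);         /* defer; safe from hard irq context */
        return IRQ_HANDLED;
}

static void my_driver_exit(void)
{
        swork_put();                    /* drop reference, worker may be destroyed */
}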
19359 diff --git a/kernel/signal.c b/kernel/signal.c
19360 index 7ebe236a5364..4d094ae3a625 100644
19361 --- a/kernel/signal.c
19362 +++ b/kernel/signal.c
19363 @@ -14,6 +14,7 @@
19364  #include <linux/export.h>
19365  #include <linux/init.h>
19366  #include <linux/sched.h>
19367 +#include <linux/sched/rt.h>
19368  #include <linux/fs.h>
19369  #include <linux/tty.h>
19370  #include <linux/binfmts.h>
19371 @@ -354,13 +355,30 @@ static bool task_participate_group_stop(struct task_struct *task)
19372         return false;
19375 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
19377 +       struct sigqueue *q = t->sigqueue_cache;
19379 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
19380 +               return NULL;
19381 +       return q;
19384 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
19386 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
19387 +               return 0;
19388 +       return 1;
19391  /*
19392   * allocate a new signal queue record
19393   * - this may be called without locks if and only if t == current, otherwise an
19394   *   appropriate lock must be held to stop the target task from exiting
19395   */
19396  static struct sigqueue *
19397 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
19398 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
19399 +                   int override_rlimit, int fromslab)
19401         struct sigqueue *q = NULL;
19402         struct user_struct *user;
19403 @@ -377,7 +395,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
19404         if (override_rlimit ||
19405             atomic_read(&user->sigpending) <=
19406                         task_rlimit(t, RLIMIT_SIGPENDING)) {
19407 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
19408 +               if (!fromslab)
19409 +                       q = get_task_cache(t);
19410 +               if (!q)
19411 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
19412         } else {
19413                 print_dropped_signal(sig);
19414         }
19415 @@ -394,6 +415,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
19416         return q;
19419 +static struct sigqueue *
19420 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
19421 +                int override_rlimit)
19423 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
19426  static void __sigqueue_free(struct sigqueue *q)
19428         if (q->flags & SIGQUEUE_PREALLOC)
19429 @@ -403,6 +431,21 @@ static void __sigqueue_free(struct sigqueue *q)
19430         kmem_cache_free(sigqueue_cachep, q);
19433 +static void sigqueue_free_current(struct sigqueue *q)
19435 +       struct user_struct *up;
19437 +       if (q->flags & SIGQUEUE_PREALLOC)
19438 +               return;
19440 +       up = q->user;
19441 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
19442 +               atomic_dec(&up->sigpending);
19443 +               free_uid(up);
19444 +       } else
19445 +                 __sigqueue_free(q);
19448  void flush_sigqueue(struct sigpending *queue)
19450         struct sigqueue *q;
19451 @@ -415,6 +458,21 @@ void flush_sigqueue(struct sigpending *queue)
19452         }
19456 + * Called from __exit_signal. Flush tsk->pending and
19457 + * tsk->sigqueue_cache
19458 + */
19459 +void flush_task_sigqueue(struct task_struct *tsk)
19461 +       struct sigqueue *q;
19463 +       flush_sigqueue(&tsk->pending);
19465 +       q = get_task_cache(tsk);
19466 +       if (q)
19467 +               kmem_cache_free(sigqueue_cachep, q);
19470  /*
19471   * Flush all pending signals for this kthread.
19472   */
19473 @@ -534,7 +592,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
19474                         (info->si_code == SI_TIMER) &&
19475                         (info->si_sys_private);
19477 -               __sigqueue_free(first);
19478 +               sigqueue_free_current(first);
19479         } else {
19480                 /*
19481                  * Ok, it wasn't in the queue.  This must be
19482 @@ -570,6 +628,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
19483         bool resched_timer = false;
19484         int signr;
19486 +       WARN_ON_ONCE(tsk != current);
19488         /* We only dequeue private signals from ourselves, we don't let
19489          * signalfd steal them
19490          */
19491 @@ -1166,8 +1226,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
19492   * We don't want to have recursive SIGSEGV's etc, for example,
19493   * that is why we also clear SIGNAL_UNKILLABLE.
19494   */
19495 -int
19496 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19497 +static int
19498 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19500         unsigned long int flags;
19501         int ret, blocked, ignored;
19502 @@ -1192,6 +1252,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19503         return ret;
19506 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19509 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
19510 + * since it cannot enable preemption, and the signal code's spin_locks
19511 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
19512 + * send the signal on exit of the trap.
19513 + */
19514 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
19515 +       if (in_atomic()) {
19516 +               if (WARN_ON_ONCE(t != current))
19517 +                       return 0;
19518 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
19519 +                       return 0;
19521 +               if (is_si_special(info)) {
19522 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
19523 +                       t->forced_info.si_signo = sig;
19524 +                       t->forced_info.si_errno = 0;
19525 +                       t->forced_info.si_code = SI_KERNEL;
19526 +                       t->forced_info.si_pid = 0;
19527 +                       t->forced_info.si_uid = 0;
19528 +               } else {
19529 +                       t->forced_info = *info;
19530 +               }
19532 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
19533 +               return 0;
19534 +       }
19535 +#endif
19536 +       return do_force_sig_info(sig, info, t);
19539  /*
19540   * Nuke all other threads in the group.
19541   */
19542 @@ -1226,12 +1319,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
19543                  * Disable interrupts early to avoid deadlocks.
19544                  * See rcu_read_unlock() comment header for details.
19545                  */
19546 -               local_irq_save(*flags);
19547 +               local_irq_save_nort(*flags);
19548                 rcu_read_lock();
19549                 sighand = rcu_dereference(tsk->sighand);
19550                 if (unlikely(sighand == NULL)) {
19551                         rcu_read_unlock();
19552 -                       local_irq_restore(*flags);
19553 +                       local_irq_restore_nort(*flags);
19554                         break;
19555                 }
19556                 /*
19557 @@ -1252,7 +1345,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
19558                 }
19559                 spin_unlock(&sighand->siglock);
19560                 rcu_read_unlock();
19561 -               local_irq_restore(*flags);
19562 +               local_irq_restore_nort(*flags);
19563         }
19565         return sighand;
19566 @@ -1495,7 +1588,8 @@ EXPORT_SYMBOL(kill_pid);
19567   */
19568  struct sigqueue *sigqueue_alloc(void)
19570 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
19571 +       /* Preallocated sigqueue objects always come from the slab cache! */
19572 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
19574         if (q)
19575                 q->flags |= SIGQUEUE_PREALLOC;
19576 @@ -1856,15 +1950,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
19577                 if (gstop_done && ptrace_reparented(current))
19578                         do_notify_parent_cldstop(current, false, why);
19580 -               /*
19581 -                * Don't want to allow preemption here, because
19582 -                * sys_ptrace() needs this task to be inactive.
19583 -                *
19584 -                * XXX: implement read_unlock_no_resched().
19585 -                */
19586 -               preempt_disable();
19587                 read_unlock(&tasklist_lock);
19588 -               preempt_enable_no_resched();
19589                 freezable_schedule();
19590         } else {
19591                 /*
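get_task_cache()/put_task_cache() above give each task a single-slot sigqueue cache that is claimed and refilled with cmpxchg(), so an RT task can recycle its most recent signal entry without entering the slab allocator on a latency-critical path. The same idea in a stand-alone user-space sketch using C11 atomics (illustration only, not kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sigq { int payload; };

static _Atomic(struct sigq *) cache;    /* one cached entry, like t->sigqueue_cache */

static struct sigq *cache_get(void)
{
        struct sigq *q = atomic_load(&cache);

        /* claim the cached entry with a compare-and-swap, as get_task_cache() does */
        if (q && atomic_compare_exchange_strong(&cache, &q, NULL))
                return q;
        return malloc(sizeof(*q));      /* otherwise fall back to the allocator */
}

static int cache_put(struct sigq *q)
{
        struct sigq *expected = NULL;

        /* stash the entry back only if the slot is currently empty */
        if (atomic_compare_exchange_strong(&cache, &expected, q))
                return 0;
        free(q);
        return 1;
}

int main(void)
{
        struct sigq *a = cache_get();   /* cache empty, falls back to malloc */

        cache_put(a);                   /* goes into the single-slot cache */
        printf("%s\n", cache_get() == a ? "recycled" : "fresh");
        return 0;
}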
19592 diff --git a/kernel/softirq.c b/kernel/softirq.c
19593 index 744fa611cae0..819bd7cf5ad0 100644
19594 --- a/kernel/softirq.c
19595 +++ b/kernel/softirq.c
19596 @@ -21,10 +21,12 @@
19597  #include <linux/freezer.h>
19598  #include <linux/kthread.h>
19599  #include <linux/rcupdate.h>
19600 +#include <linux/delay.h>
19601  #include <linux/ftrace.h>
19602  #include <linux/smp.h>
19603  #include <linux/smpboot.h>
19604  #include <linux/tick.h>
19605 +#include <linux/locallock.h>
19606  #include <linux/irq.h>
19608  #define CREATE_TRACE_POINTS
19609 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
19610  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
19612  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
19613 +#ifdef CONFIG_PREEMPT_RT_FULL
19614 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
19615 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
19616 +#endif
19618  const char * const softirq_to_name[NR_SOFTIRQS] = {
19619         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
19620         "TASKLET", "SCHED", "HRTIMER", "RCU"
19621  };
19623 +#ifdef CONFIG_NO_HZ_COMMON
19624 +# ifdef CONFIG_PREEMPT_RT_FULL
19626 +struct softirq_runner {
19627 +       struct task_struct *runner[NR_SOFTIRQS];
19630 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
19632 +static inline void softirq_set_runner(unsigned int sirq)
19634 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19636 +       sr->runner[sirq] = current;
19639 +static inline void softirq_clr_runner(unsigned int sirq)
19641 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19643 +       sr->runner[sirq] = NULL;
19647 + * On preempt-rt a softirq running context might be blocked on a
19648 + * lock. There might be no other runnable task on this CPU because the
19649 + * lock owner runs on some other CPU. So we have to go into idle with
19650 + * the pending bit set. Therefore we need to check this, otherwise we
19651 + * warn about false positives, which confuses users and defeats the
19652 + * whole purpose of this test.
19653 + *
19654 + * This code is called with interrupts disabled.
19655 + */
19656 +void softirq_check_pending_idle(void)
19658 +       static int rate_limit;
19659 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19660 +       u32 warnpending;
19661 +       int i;
19663 +       if (rate_limit >= 10)
19664 +               return;
19666 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
19667 +       for (i = 0; i < NR_SOFTIRQS; i++) {
19668 +               struct task_struct *tsk = sr->runner[i];
19670 +               /*
19671 +                * The wakeup code in rtmutex.c wakes up the task
19672 +                * _before_ it sets pi_blocked_on to NULL under
19673 +                * tsk->pi_lock. So we need to check for both: state
19674 +                * and pi_blocked_on.
19675 +                */
19676 +               if (tsk) {
19677 +                       raw_spin_lock(&tsk->pi_lock);
19678 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
19679 +                               /* Clear all bits pending in that task */
19680 +                               warnpending &= ~(tsk->softirqs_raised);
19681 +                               warnpending &= ~(1 << i);
19682 +                       }
19683 +                       raw_spin_unlock(&tsk->pi_lock);
19684 +               }
19685 +       }
19687 +       if (warnpending) {
19688 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19689 +                      warnpending);
19690 +               rate_limit++;
19691 +       }
19693 +# else
19695 + * On !PREEMPT_RT we just printk rate limited:
19696 + */
19697 +void softirq_check_pending_idle(void)
19699 +       static int rate_limit;
19701 +       if (rate_limit < 10 &&
19702 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
19703 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19704 +                      local_softirq_pending());
19705 +               rate_limit++;
19706 +       }
19708 +# endif
19710 +#else /* !CONFIG_NO_HZ_COMMON */
19711 +static inline void softirq_set_runner(unsigned int sirq) { }
19712 +static inline void softirq_clr_runner(unsigned int sirq) { }
19713 +#endif
19715  /*
19716   * we cannot loop indefinitely here to avoid userspace starvation,
19717   * but we also don't want to introduce a worst case 1/HZ latency
19718 @@ -77,6 +175,38 @@ static void wakeup_softirqd(void)
19719                 wake_up_process(tsk);
19722 +#ifdef CONFIG_PREEMPT_RT_FULL
19723 +static void wakeup_timer_softirqd(void)
19725 +       /* Interrupts are disabled: no need to stop preemption */
19726 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
19728 +       if (tsk && tsk->state != TASK_RUNNING)
19729 +               wake_up_process(tsk);
19731 +#endif
19733 +static void handle_softirq(unsigned int vec_nr)
19735 +       struct softirq_action *h = softirq_vec + vec_nr;
19736 +       int prev_count;
19738 +       prev_count = preempt_count();
19740 +       kstat_incr_softirqs_this_cpu(vec_nr);
19742 +       trace_softirq_entry(vec_nr);
19743 +       h->action(h);
19744 +       trace_softirq_exit(vec_nr);
19745 +       if (unlikely(prev_count != preempt_count())) {
19746 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19747 +                      vec_nr, softirq_to_name[vec_nr], h->action,
19748 +                      prev_count, preempt_count());
19749 +               preempt_count_set(prev_count);
19750 +       }
19753 +#ifndef CONFIG_PREEMPT_RT_FULL
19754  /*
19755   * If ksoftirqd is scheduled, we do not want to process pending softirqs
19756   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
19757 @@ -88,6 +218,47 @@ static bool ksoftirqd_running(void)
19758         return tsk && (tsk->state == TASK_RUNNING);
19761 +static inline int ksoftirqd_softirq_pending(void)
19763 +       return local_softirq_pending();
19766 +static void handle_pending_softirqs(u32 pending)
19768 +       struct softirq_action *h = softirq_vec;
19769 +       int softirq_bit;
19771 +       local_irq_enable();
19773 +       h = softirq_vec;
19775 +       while ((softirq_bit = ffs(pending))) {
19776 +               unsigned int vec_nr;
19778 +               h += softirq_bit - 1;
19779 +               vec_nr = h - softirq_vec;
19780 +               handle_softirq(vec_nr);
19782 +               h++;
19783 +               pending >>= softirq_bit;
19784 +       }
19786 +       rcu_bh_qs();
19787 +       local_irq_disable();
19790 +static void run_ksoftirqd(unsigned int cpu)
19792 +       local_irq_disable();
19793 +       if (ksoftirqd_softirq_pending()) {
19794 +               __do_softirq();
19795 +               local_irq_enable();
19796 +               cond_resched_rcu_qs();
19797 +               return;
19798 +       }
19799 +       local_irq_enable();
19802  /*
19803   * preempt_count and SOFTIRQ_OFFSET usage:
19804   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
19805 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19806         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
19807         unsigned long old_flags = current->flags;
19808         int max_restart = MAX_SOFTIRQ_RESTART;
19809 -       struct softirq_action *h;
19810         bool in_hardirq;
19811         __u32 pending;
19812 -       int softirq_bit;
19814         /*
19815          * Mask out PF_MEMALLOC s current task context is borrowed for the
19816 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19817         /* Reset the pending bitmask before enabling irqs */
19818         set_softirq_pending(0);
19820 -       local_irq_enable();
19822 -       h = softirq_vec;
19824 -       while ((softirq_bit = ffs(pending))) {
19825 -               unsigned int vec_nr;
19826 -               int prev_count;
19828 -               h += softirq_bit - 1;
19830 -               vec_nr = h - softirq_vec;
19831 -               prev_count = preempt_count();
19833 -               kstat_incr_softirqs_this_cpu(vec_nr);
19835 -               trace_softirq_entry(vec_nr);
19836 -               h->action(h);
19837 -               trace_softirq_exit(vec_nr);
19838 -               if (unlikely(prev_count != preempt_count())) {
19839 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19840 -                              vec_nr, softirq_to_name[vec_nr], h->action,
19841 -                              prev_count, preempt_count());
19842 -                       preempt_count_set(prev_count);
19843 -               }
19844 -               h++;
19845 -               pending >>= softirq_bit;
19846 -       }
19848 -       rcu_bh_qs();
19849 -       local_irq_disable();
19850 +       handle_pending_softirqs(pending);
19852         pending = local_softirq_pending();
19853         if (pending) {
19854 @@ -330,6 +470,309 @@ asmlinkage __visible void do_softirq(void)
19855         local_irq_restore(flags);
19859 + * This function must run with irqs disabled!
19860 + */
19861 +void raise_softirq_irqoff(unsigned int nr)
19863 +       __raise_softirq_irqoff(nr);
19865 +       /*
19866 +        * If we're in an interrupt or softirq, we're done
19867 +        * (this also catches softirq-disabled code). We will
19868 +        * actually run the softirq once we return from
19869 +        * the irq or softirq.
19870 +        *
19871 +        * Otherwise we wake up ksoftirqd to make sure we
19872 +        * schedule the softirq soon.
19873 +        */
19874 +       if (!in_interrupt())
19875 +               wakeup_softirqd();
19878 +void __raise_softirq_irqoff(unsigned int nr)
19880 +       trace_softirq_raise(nr);
19881 +       or_softirq_pending(1UL << nr);
19884 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
19885 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
19886 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
19888 +#else /* !PREEMPT_RT_FULL */
19891 + * On RT we serialize softirq execution with a cpu local lock per softirq
19892 + */
19893 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
19895 +void __init softirq_early_init(void)
19897 +       int i;
19899 +       for (i = 0; i < NR_SOFTIRQS; i++)
19900 +               local_irq_lock_init(local_softirq_locks[i]);
19903 +static void lock_softirq(int which)
19905 +       local_lock(local_softirq_locks[which]);
19908 +static void unlock_softirq(int which)
19910 +       local_unlock(local_softirq_locks[which]);
19913 +static void do_single_softirq(int which)
19915 +       unsigned long old_flags = current->flags;
19917 +       current->flags &= ~PF_MEMALLOC;
19918 +       vtime_account_irq_enter(current);
19919 +       current->flags |= PF_IN_SOFTIRQ;
19920 +       lockdep_softirq_enter();
19921 +       local_irq_enable();
19922 +       handle_softirq(which);
19923 +       local_irq_disable();
19924 +       lockdep_softirq_exit();
19925 +       current->flags &= ~PF_IN_SOFTIRQ;
19926 +       vtime_account_irq_enter(current);
19927 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
19931 + * Called with interrupts disabled. Process softirqs which were raised
19932 + * in current context (or on behalf of ksoftirqd).
19933 + */
19934 +static void do_current_softirqs(void)
19936 +       while (current->softirqs_raised) {
19937 +               int i = __ffs(current->softirqs_raised);
19938 +               unsigned int pending, mask = (1U << i);
19940 +               current->softirqs_raised &= ~mask;
19941 +               local_irq_enable();
19943 +               /*
19944 +                * If the lock is contended, we boost the owner to
19945 +                * process the softirq or leave the critical section
19946 +                * now.
19947 +                */
19948 +               lock_softirq(i);
19949 +               local_irq_disable();
19950 +               softirq_set_runner(i);
19951 +               /*
19952 +                * Check the local_softirq_pending() bits to see whether we
19953 +                * still need to process this or if someone else already
19954 +                * took care of it.
19955 +                */
19956 +               pending = local_softirq_pending();
19957 +               if (pending & mask) {
19958 +                       set_softirq_pending(pending & ~mask);
19959 +                       do_single_softirq(i);
19960 +               }
19961 +               softirq_clr_runner(i);
19962 +               WARN_ON(current->softirq_nestcnt != 1);
19963 +               local_irq_enable();
19964 +               unlock_softirq(i);
19965 +               local_irq_disable();
19966 +       }
19969 +void __local_bh_disable(void)
19971 +       if (++current->softirq_nestcnt == 1)
19972 +               migrate_disable();
19974 +EXPORT_SYMBOL(__local_bh_disable);
19976 +void __local_bh_enable(void)
19978 +       if (WARN_ON(current->softirq_nestcnt == 0))
19979 +               return;
19981 +       local_irq_disable();
19982 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
19983 +               do_current_softirqs();
19984 +       local_irq_enable();
19986 +       if (--current->softirq_nestcnt == 0)
19987 +               migrate_enable();
19989 +EXPORT_SYMBOL(__local_bh_enable);
19991 +void _local_bh_enable(void)
19993 +       if (WARN_ON(current->softirq_nestcnt == 0))
19994 +               return;
19995 +       if (--current->softirq_nestcnt == 0)
19996 +               migrate_enable();
19998 +EXPORT_SYMBOL(_local_bh_enable);
20000 +int in_serving_softirq(void)
20002 +       return current->flags & PF_IN_SOFTIRQ;
20004 +EXPORT_SYMBOL(in_serving_softirq);
20006 +/* Called with preemption disabled */
20007 +static void run_ksoftirqd(unsigned int cpu)
20009 +       local_irq_disable();
20010 +       current->softirq_nestcnt++;
20012 +       do_current_softirqs();
20013 +       current->softirq_nestcnt--;
20014 +       local_irq_enable();
20015 +       cond_resched_rcu_qs();
20019 + * Called from netif_rx_ni(). Preemption enabled, but migration
20020 + * disabled. So the cpu can't go away under us.
20021 + */
20022 +void thread_do_softirq(void)
20024 +       if (!in_serving_softirq() && current->softirqs_raised) {
20025 +               current->softirq_nestcnt++;
20026 +               do_current_softirqs();
20027 +               current->softirq_nestcnt--;
20028 +       }
20031 +static void do_raise_softirq_irqoff(unsigned int nr)
20033 +       unsigned int mask;
20035 +       mask = 1UL << nr;
20037 +       trace_softirq_raise(nr);
20038 +       or_softirq_pending(mask);
20040 +       /*
20041 +        * If we are not in a hard interrupt and inside a bh disabled
20042 +        * region, we simply raise the flag on current. local_bh_enable()
20043 +        * will make sure that the softirq is executed. Otherwise we
20044 +        * delegate it to ksoftirqd.
20045 +        */
20046 +       if (!in_irq() && current->softirq_nestcnt)
20047 +               current->softirqs_raised |= mask;
20048 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
20049 +               return;
20051 +       if (mask & TIMER_SOFTIRQS)
20052 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
20053 +       else
20054 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
20057 +static void wakeup_proper_softirq(unsigned int nr)
20059 +       if ((1UL << nr) & TIMER_SOFTIRQS)
20060 +               wakeup_timer_softirqd();
20061 +       else
20062 +               wakeup_softirqd();
20065 +void __raise_softirq_irqoff(unsigned int nr)
20067 +       do_raise_softirq_irqoff(nr);
20068 +       if (!in_irq() && !current->softirq_nestcnt)
20069 +               wakeup_proper_softirq(nr);
20073 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
20074 + */
20075 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
20077 +       unsigned int mask;
20079 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
20080 +                        !__this_cpu_read(ktimer_softirqd)))
20081 +               return;
20082 +       mask = 1UL << nr;
20084 +       trace_softirq_raise(nr);
20085 +       or_softirq_pending(mask);
20086 +       if (mask & TIMER_SOFTIRQS)
20087 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
20088 +       else
20089 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
20090 +       wakeup_proper_softirq(nr);
20094 + * This function must run with irqs disabled!
20095 + */
20096 +void raise_softirq_irqoff(unsigned int nr)
20098 +       do_raise_softirq_irqoff(nr);
20100 +       /*
20101 +        * If we're in a hard interrupt we let the irq return code deal
20102 +        * with the wakeup of ksoftirqd.
20103 +        */
20104 +       if (in_irq())
20105 +               return;
20106 +       /*
20107 +        * If we are in thread context but outside of a bh disabled
20108 +        * region, we need to wake ksoftirqd as well.
20109 +        *
20110 +        * CHECKME: Some of the places which do that could be wrapped
20111 +        * into local_bh_disable/enable pairs. Though it's unclear
20112 +        * whether this is worth the effort. To find those places just
20113 +        * raise a WARN() if the condition is met.
20114 +        */
20115 +       if (!current->softirq_nestcnt)
20116 +               wakeup_proper_softirq(nr);
20119 +static inline int ksoftirqd_softirq_pending(void)
20121 +       return current->softirqs_raised;
20124 +static inline void local_bh_disable_nort(void) { }
20125 +static inline void _local_bh_enable_nort(void) { }
20127 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
20129 +       /* Take over all but timer pending softirqs when starting */
20130 +       local_irq_disable();
20131 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
20132 +       local_irq_enable();
20135 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
20137 +       struct sched_param param = { .sched_priority = 1 };
20139 +       sched_setscheduler(current, SCHED_FIFO, &param);
20141 +       /* Take over timer pending softirqs when starting */
20142 +       local_irq_disable();
20143 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
20144 +       local_irq_enable();
20147 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
20148 +                                                   bool online)
20150 +       struct sched_param param = { .sched_priority = 0 };
20152 +       sched_setscheduler(current, SCHED_NORMAL, &param);
20155 +static int ktimer_softirqd_should_run(unsigned int cpu)
20157 +       return current->softirqs_raised;
20160 +#endif /* PREEMPT_RT_FULL */
20161  /*
20162   * Enter an interrupt context.
20163   */
20164 @@ -341,9 +784,9 @@ void irq_enter(void)
20165                  * Prevent raise_softirq from needlessly waking up ksoftirqd
20166                  * here, as softirq will be serviced on return from interrupt.
20167                  */
20168 -               local_bh_disable();
20169 +               local_bh_disable_nort();
20170                 tick_irq_enter();
20171 -               _local_bh_enable();
20172 +               _local_bh_enable_nort();
20173         }
20175         __irq_enter();
20176 @@ -351,6 +794,7 @@ void irq_enter(void)
20178  static inline void invoke_softirq(void)
20180 +#ifndef CONFIG_PREEMPT_RT_FULL
20181         if (ksoftirqd_running())
20182                 return;
20184 @@ -373,6 +817,18 @@ static inline void invoke_softirq(void)
20185         } else {
20186                 wakeup_softirqd();
20187         }
20188 +#else /* PREEMPT_RT_FULL */
20189 +       unsigned long flags;
20191 +       local_irq_save(flags);
20192 +       if (__this_cpu_read(ksoftirqd) &&
20193 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
20194 +               wakeup_softirqd();
20195 +       if (__this_cpu_read(ktimer_softirqd) &&
20196 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
20197 +               wakeup_timer_softirqd();
20198 +       local_irq_restore(flags);
20199 +#endif
20202  static inline void tick_irq_exit(void)
20203 @@ -409,26 +865,6 @@ void irq_exit(void)
20204         trace_hardirq_exit(); /* must be last! */
20208 - * This function must run with irqs disabled!
20209 - */
20210 -inline void raise_softirq_irqoff(unsigned int nr)
20212 -       __raise_softirq_irqoff(nr);
20214 -       /*
20215 -        * If we're in an interrupt or softirq, we're done
20216 -        * (this also catches softirq-disabled code). We will
20217 -        * actually run the softirq once we return from
20218 -        * the irq or softirq.
20219 -        *
20220 -        * Otherwise we wake up ksoftirqd to make sure we
20221 -        * schedule the softirq soon.
20222 -        */
20223 -       if (!in_interrupt())
20224 -               wakeup_softirqd();
20227  void raise_softirq(unsigned int nr)
20229         unsigned long flags;
20230 @@ -438,12 +874,6 @@ void raise_softirq(unsigned int nr)
20231         local_irq_restore(flags);
20234 -void __raise_softirq_irqoff(unsigned int nr)
20236 -       trace_softirq_raise(nr);
20237 -       or_softirq_pending(1UL << nr);
20240  void open_softirq(int nr, void (*action)(struct softirq_action *))
20242         softirq_vec[nr].action = action;
20243 @@ -460,15 +890,45 @@ struct tasklet_head {
20244  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
20245  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
20247 +static void inline
20248 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
20250 +       if (tasklet_trylock(t)) {
20251 +again:
20252 +               /* We may have been preempted before tasklet_trylock
20253 +                * and __tasklet_action may have already run.
20254 +                * So double check the sched bit while the tasklet
20255 +                * is locked before adding it to the list.
20256 +                */
20257 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
20258 +                       t->next = NULL;
20259 +                       *head->tail = t;
20260 +                       head->tail = &(t->next);
20261 +                       raise_softirq_irqoff(nr);
20262 +                       tasklet_unlock(t);
20263 +               } else {
20264 +                       /* This is subtle. If we hit the corner case above,
20265 +                        * it is possible that we get preempted right here,
20266 +                        * and another task has successfully called
20267 +                        * tasklet_schedule(), then this function, and
20268 +                        * failed on the trylock. Thus we must be sure
20269 +                        * before releasing the tasklet lock, that the
20270 +                        * SCHED_BIT is clear. Otherwise the tasklet
20271 +                        * may get its SCHED_BIT set, but not added to the
20272 +                        * list.
20273 +                        */
20274 +                       if (!tasklet_tryunlock(t))
20275 +                               goto again;
20276 +               }
20277 +       }
20280  void __tasklet_schedule(struct tasklet_struct *t)
20282         unsigned long flags;
20284         local_irq_save(flags);
20285 -       t->next = NULL;
20286 -       *__this_cpu_read(tasklet_vec.tail) = t;
20287 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
20288 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
20289 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
20290         local_irq_restore(flags);
20292  EXPORT_SYMBOL(__tasklet_schedule);
20293 @@ -478,10 +938,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
20294         unsigned long flags;
20296         local_irq_save(flags);
20297 -       t->next = NULL;
20298 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
20299 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
20300 -       raise_softirq_irqoff(HI_SOFTIRQ);
20301 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
20302         local_irq_restore(flags);
20304  EXPORT_SYMBOL(__tasklet_hi_schedule);
20305 @@ -490,82 +947,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
20307         BUG_ON(!irqs_disabled());
20309 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
20310 -       __this_cpu_write(tasklet_hi_vec.head, t);
20311 -       __raise_softirq_irqoff(HI_SOFTIRQ);
20312 +       __tasklet_hi_schedule(t);
20314  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
20316 -static __latent_entropy void tasklet_action(struct softirq_action *a)
20317 +void  tasklet_enable(struct tasklet_struct *t)
20319 -       struct tasklet_struct *list;
20320 +       if (!atomic_dec_and_test(&t->count))
20321 +               return;
20322 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
20323 +               tasklet_schedule(t);
20325 +EXPORT_SYMBOL(tasklet_enable);
20327 -       local_irq_disable();
20328 -       list = __this_cpu_read(tasklet_vec.head);
20329 -       __this_cpu_write(tasklet_vec.head, NULL);
20330 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
20331 -       local_irq_enable();
20332 +static void __tasklet_action(struct softirq_action *a,
20333 +                            struct tasklet_struct *list)
20335 +       int loops = 1000000;
20337         while (list) {
20338                 struct tasklet_struct *t = list;
20340                 list = list->next;
20342 -               if (tasklet_trylock(t)) {
20343 -                       if (!atomic_read(&t->count)) {
20344 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
20345 -                                                       &t->state))
20346 -                                       BUG();
20347 -                               t->func(t->data);
20348 -                               tasklet_unlock(t);
20349 -                               continue;
20350 -                       }
20351 -                       tasklet_unlock(t);
20352 +               /*
20353 +                * Should always succeed - after a tasklet got on the
20354 +                * list (after getting the SCHED bit set from 0 to 1),
20355 +                * nothing but the tasklet softirq it got queued to can
20356 +                * lock it:
20357 +                */
20358 +               if (!tasklet_trylock(t)) {
20359 +                       WARN_ON(1);
20360 +                       continue;
20361                 }
20363 -               local_irq_disable();
20364                 t->next = NULL;
20365 -               *__this_cpu_read(tasklet_vec.tail) = t;
20366 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
20367 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
20368 -               local_irq_enable();
20370 +               /*
20371 +                * If we cannot handle the tasklet because it's disabled,
20372 +                * mark it as pending. tasklet_enable() will later
20373 +                * re-schedule the tasklet.
20374 +                */
20375 +               if (unlikely(atomic_read(&t->count))) {
20376 +out_disabled:
20377 +                       /* implicit unlock: */
20378 +                       wmb();
20379 +                       t->state = TASKLET_STATEF_PENDING;
20380 +                       continue;
20381 +               }
20383 +               /*
20384 +                * After this point on the tasklet might be rescheduled
20385 +                * on another CPU, but it can only be added to another
20386 +                * CPU's tasklet list if we unlock the tasklet (which we
20387 +                * dont do yet).
20388 +                * don't do yet).
20389 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
20390 +                       WARN_ON(1);
20392 +again:
20393 +               t->func(t->data);
20395 +               /*
20396 +                * Try to unlock the tasklet. We must use cmpxchg, because
20397 +                * another CPU might have scheduled or disabled the tasklet.
20398 +                * We only allow the STATE_RUN -> 0 transition here.
20399 +                */
20400 +               while (!tasklet_tryunlock(t)) {
20401 +                       /*
20402 +                        * If it got disabled meanwhile, bail out:
20403 +                        */
20404 +                       if (atomic_read(&t->count))
20405 +                               goto out_disabled;
20406 +                       /*
20407 +                        * If it got scheduled meanwhile, re-execute
20408 +                        * the tasklet function:
20409 +                        */
20410 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
20411 +                               goto again;
20412 +                       if (!--loops) {
20413 +                               printk("hm, tasklet state: %08lx\n", t->state);
20414 +                               WARN_ON(1);
20415 +                               tasklet_unlock(t);
20416 +                               break;
20417 +                       }
20418 +               }
20419         }
20422 +static void tasklet_action(struct softirq_action *a)
20424 +       struct tasklet_struct *list;
20426 +       local_irq_disable();
20428 +       list = __this_cpu_read(tasklet_vec.head);
20429 +       __this_cpu_write(tasklet_vec.head, NULL);
20430 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
20432 +       local_irq_enable();
20434 +       __tasklet_action(a, list);
20437  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
20439         struct tasklet_struct *list;
20441         local_irq_disable();
20443         list = __this_cpu_read(tasklet_hi_vec.head);
20444         __this_cpu_write(tasklet_hi_vec.head, NULL);
20445         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
20446 -       local_irq_enable();
20448 -       while (list) {
20449 -               struct tasklet_struct *t = list;
20451 -               list = list->next;
20453 -               if (tasklet_trylock(t)) {
20454 -                       if (!atomic_read(&t->count)) {
20455 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
20456 -                                                       &t->state))
20457 -                                       BUG();
20458 -                               t->func(t->data);
20459 -                               tasklet_unlock(t);
20460 -                               continue;
20461 -                       }
20462 -                       tasklet_unlock(t);
20463 -               }
20464 +       local_irq_enable();
20466 -               local_irq_disable();
20467 -               t->next = NULL;
20468 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
20469 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
20470 -               __raise_softirq_irqoff(HI_SOFTIRQ);
20471 -               local_irq_enable();
20472 -       }
20473 +       __tasklet_action(a, list);
20476  void tasklet_init(struct tasklet_struct *t,
20477 @@ -586,7 +1083,7 @@ void tasklet_kill(struct tasklet_struct *t)
20479         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
20480                 do {
20481 -                       yield();
20482 +                       msleep(1);
20483                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
20484         }
20485         tasklet_unlock_wait(t);
20486 @@ -660,25 +1157,26 @@ void __init softirq_init(void)
20487         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
20490 -static int ksoftirqd_should_run(unsigned int cpu)
20491 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
20492 +void tasklet_unlock_wait(struct tasklet_struct *t)
20494 -       return local_softirq_pending();
20497 -static void run_ksoftirqd(unsigned int cpu)
20499 -       local_irq_disable();
20500 -       if (local_softirq_pending()) {
20501 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
20502                 /*
20503 -                * We can safely run softirq on inline stack, as we are not deep
20504 -                * in the task stack here.
20505 +                * Hack for now to avoid this busy-loop:
20506                  */
20507 -               __do_softirq();
20508 -               local_irq_enable();
20509 -               cond_resched_rcu_qs();
20510 -               return;
20511 +#ifdef CONFIG_PREEMPT_RT_FULL
20512 +               msleep(1);
20513 +#else
20514 +               barrier();
20515 +#endif
20516         }
20517 -       local_irq_enable();
20519 +EXPORT_SYMBOL(tasklet_unlock_wait);
20520 +#endif
20522 +static int ksoftirqd_should_run(unsigned int cpu)
20524 +       return ksoftirqd_softirq_pending();
20527  #ifdef CONFIG_HOTPLUG_CPU
20528 @@ -745,17 +1243,31 @@ static int takeover_tasklets(unsigned int cpu)
20530  static struct smp_hotplug_thread softirq_threads = {
20531         .store                  = &ksoftirqd,
20532 +       .setup                  = ksoftirqd_set_sched_params,
20533         .thread_should_run      = ksoftirqd_should_run,
20534         .thread_fn              = run_ksoftirqd,
20535         .thread_comm            = "ksoftirqd/%u",
20536  };
20538 +#ifdef CONFIG_PREEMPT_RT_FULL
20539 +static struct smp_hotplug_thread softirq_timer_threads = {
20540 +       .store                  = &ktimer_softirqd,
20541 +       .setup                  = ktimer_softirqd_set_sched_params,
20542 +       .cleanup                = ktimer_softirqd_clr_sched_params,
20543 +       .thread_should_run      = ktimer_softirqd_should_run,
20544 +       .thread_fn              = run_ksoftirqd,
20545 +       .thread_comm            = "ktimersoftd/%u",
20547 +#endif
20549  static __init int spawn_ksoftirqd(void)
20551         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
20552                                   takeover_tasklets);
20553         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
20555 +#ifdef CONFIG_PREEMPT_RT_FULL
20556 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
20557 +#endif
20558         return 0;
20560  early_initcall(spawn_ksoftirqd);
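
The practical upshot of the tasklet rework above for driver code: a tasklet that fires while it is disabled is no longer re-queued by the softirq loop itself; it is parked with TASKLET_STATE_PENDING, and tasklet_enable() re-schedules it once the disable count drops back to zero. A minimal, illustrative driver-side sketch of the usual disable/enable pairing this relies on (the my_* names are hypothetical, not part of the patch):

#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)
{
	/* Bottom-half work; on -rt, softirq work generally runs in thread context. */
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static void my_reconfigure(void)
{
	tasklet_disable(&my_tasklet);	/* bump the disable count, wait for a running callback */
	/* ... modify data the tasklet also touches ... */
	tasklet_enable(&my_tasklet);	/* with this patch: re-schedules it if it fired meanwhile */
}
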
20561 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
20562 index ec9ab2f01489..8b89dbedeaff 100644
20563 --- a/kernel/stop_machine.c
20564 +++ b/kernel/stop_machine.c
20565 @@ -36,7 +36,7 @@ struct cpu_stop_done {
20566  struct cpu_stopper {
20567         struct task_struct      *thread;
20569 -       spinlock_t              lock;
20570 +       raw_spinlock_t          lock;
20571         bool                    enabled;        /* is this stopper enabled? */
20572         struct list_head        works;          /* list of pending works */
20574 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
20575         unsigned long flags;
20576         bool enabled;
20578 -       spin_lock_irqsave(&stopper->lock, flags);
20579 +       raw_spin_lock_irqsave(&stopper->lock, flags);
20580         enabled = stopper->enabled;
20581         if (enabled)
20582                 __cpu_stop_queue_work(stopper, work);
20583         else if (work->done)
20584                 cpu_stop_signal_done(work->done);
20585 -       spin_unlock_irqrestore(&stopper->lock, flags);
20587 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
20588         return enabled;
20591 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
20592         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
20593         int err;
20594  retry:
20595 -       spin_lock_irq(&stopper1->lock);
20596 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
20597 +       raw_spin_lock_irq(&stopper1->lock);
20598 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
20600         err = -ENOENT;
20601         if (!stopper1->enabled || !stopper2->enabled)
20602 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
20603         __cpu_stop_queue_work(stopper1, work1);
20604         __cpu_stop_queue_work(stopper2, work2);
20605  unlock:
20606 -       spin_unlock(&stopper2->lock);
20607 -       spin_unlock_irq(&stopper1->lock);
20608 +       raw_spin_unlock(&stopper2->lock);
20609 +       raw_spin_unlock_irq(&stopper1->lock);
20611         if (unlikely(err == -EDEADLK)) {
20612                 while (stop_cpus_in_progress)
20613 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
20614         unsigned long flags;
20615         int run;
20617 -       spin_lock_irqsave(&stopper->lock, flags);
20618 +       raw_spin_lock_irqsave(&stopper->lock, flags);
20619         run = !list_empty(&stopper->works);
20620 -       spin_unlock_irqrestore(&stopper->lock, flags);
20621 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
20622         return run;
20625 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
20627  repeat:
20628         work = NULL;
20629 -       spin_lock_irq(&stopper->lock);
20630 +       raw_spin_lock_irq(&stopper->lock);
20631         if (!list_empty(&stopper->works)) {
20632                 work = list_first_entry(&stopper->works,
20633                                         struct cpu_stop_work, list);
20634                 list_del_init(&work->list);
20635         }
20636 -       spin_unlock_irq(&stopper->lock);
20637 +       raw_spin_unlock_irq(&stopper->lock);
20639         if (work) {
20640                 cpu_stop_fn_t fn = work->fn;
20641 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
20642                 struct cpu_stop_done *done = work->done;
20643                 int ret;
20645 +               /* XXX */
20647                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
20648                 preempt_count_inc();
20649                 ret = fn(arg);
20650 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
20651         for_each_possible_cpu(cpu) {
20652                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
20654 -               spin_lock_init(&stopper->lock);
20655 +               raw_spin_lock_init(&stopper->lock);
20656                 INIT_LIST_HEAD(&stopper->works);
20657         }
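
For context on the spinlock_t to raw_spinlock_t conversion above: on PREEMPT_RT a plain spinlock_t becomes a sleeping, rtmutex-based lock, so locks taken on hard-atomic paths (here, queueing stop-machine work with interrupts disabled) have to stay raw spinning locks. A minimal sketch of the raw-lock pattern, with hypothetical names:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(my_lock);	/* hypothetical; stays a spinning lock even on -rt */
static unsigned long my_counter;

static void my_update(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&my_lock, flags);		/* never sleeps, even on -rt */
	my_counter++;
	raw_spin_unlock_irqrestore(&my_lock, flags);
}
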
20659 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
20660 index eeb7f2f5698d..369203af6406 100644
20661 --- a/kernel/time/hrtimer.c
20662 +++ b/kernel/time/hrtimer.c
20663 @@ -53,6 +53,7 @@
20664  #include <asm/uaccess.h>
20666  #include <trace/events/timer.h>
20667 +#include <trace/events/hist.h>
20669  #include "tick-internal.h"
20671 @@ -693,6 +694,29 @@ static void hrtimer_switch_to_hres(void)
20672         retrigger_next_event(NULL);
20675 +#ifdef CONFIG_PREEMPT_RT_FULL
20677 +static struct swork_event clock_set_delay_work;
20679 +static void run_clock_set_delay(struct swork_event *event)
20681 +       clock_was_set();
20684 +void clock_was_set_delayed(void)
20686 +       swork_queue(&clock_set_delay_work);
20689 +static __init int create_clock_set_delay_thread(void)
20691 +       WARN_ON(swork_get());
20692 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
20693 +       return 0;
20695 +early_initcall(create_clock_set_delay_thread);
20696 +#else /* PREEMPT_RT_FULL */
20698  static void clock_was_set_work(struct work_struct *work)
20700         clock_was_set();
20701 @@ -708,6 +732,7 @@ void clock_was_set_delayed(void)
20703         schedule_work(&hrtimer_work);
20705 +#endif
20707  #else
20709 @@ -717,11 +742,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
20710  static inline void hrtimer_switch_to_hres(void) { }
20711  static inline void
20712  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
20713 -static inline int hrtimer_reprogram(struct hrtimer *timer,
20714 -                                   struct hrtimer_clock_base *base)
20716 -       return 0;
20718 +static inline void hrtimer_reprogram(struct hrtimer *timer,
20719 +                                    struct hrtimer_clock_base *base) { }
20720  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
20721  static inline void retrigger_next_event(void *arg) { }
20723 @@ -853,6 +875,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
20725  EXPORT_SYMBOL_GPL(hrtimer_forward);
20727 +#ifdef CONFIG_PREEMPT_RT_BASE
20728 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
20730 +/**
20731 + * hrtimer_wait_for_timer - Wait for a running timer
20732 + *
20733 + * @timer:     timer to wait for
20734 + *
20735 + * The function waits on the waitqueue of the timer base in case the
20736 + * timer's callback function is currently executing. The
20737 + * waitqueue is woken up after the timer callback function has
20738 + * finished execution.
20739 + */
20740 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
20742 +       struct hrtimer_clock_base *base = timer->base;
20744 +       if (base && base->cpu_base && !timer->irqsafe)
20745 +               wait_event(base->cpu_base->wait,
20746 +                               !(hrtimer_callback_running(timer)));
20749 +#else
20750 +# define wake_up_timer_waiters(b)      do { } while (0)
20751 +#endif
20753  /*
20754   * enqueue_hrtimer - internal function to (re)start a timer
20755   *
20756 @@ -894,6 +942,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
20757         if (!(state & HRTIMER_STATE_ENQUEUED))
20758                 return;
20760 +       if (unlikely(!list_empty(&timer->cb_entry))) {
20761 +               list_del_init(&timer->cb_entry);
20762 +               return;
20763 +       }
20765         if (!timerqueue_del(&base->active, &timer->node))
20766                 cpu_base->active_bases &= ~(1 << base->index);
20768 @@ -989,7 +1042,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
20769         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
20771         timer_stats_hrtimer_set_start_info(timer);
20772 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20773 +       {
20774 +               ktime_t now = new_base->get_time();
20776 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
20777 +                       timer->praecox = now;
20778 +               else
20779 +                       timer->praecox = ktime_set(0, 0);
20780 +       }
20781 +#endif
20782         leftmost = enqueue_hrtimer(timer, new_base);
20783         if (!leftmost)
20784                 goto unlock;
20785 @@ -1061,7 +1123,7 @@ int hrtimer_cancel(struct hrtimer *timer)
20787                 if (ret >= 0)
20788                         return ret;
20789 -               cpu_relax();
20790 +               hrtimer_wait_for_timer(timer);
20791         }
20793  EXPORT_SYMBOL_GPL(hrtimer_cancel);
20794 @@ -1137,6 +1199,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
20796         base = hrtimer_clockid_to_base(clock_id);
20797         timer->base = &cpu_base->clock_base[base];
20798 +       INIT_LIST_HEAD(&timer->cb_entry);
20799         timerqueue_init(&timer->node);
20801  #ifdef CONFIG_TIMER_STATS
20802 @@ -1177,6 +1240,7 @@ bool hrtimer_active(const struct hrtimer *timer)
20803                 seq = raw_read_seqcount_begin(&cpu_base->seq);
20805                 if (timer->state != HRTIMER_STATE_INACTIVE ||
20806 +                   cpu_base->running_soft == timer ||
20807                     cpu_base->running == timer)
20808                         return true;
20810 @@ -1275,10 +1339,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
20811         cpu_base->running = NULL;
20814 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20815 +#ifdef CONFIG_PREEMPT_RT_BASE
20816 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
20817 +                                struct hrtimer_clock_base *base)
20819 +       int leftmost;
20821 +       if (restart != HRTIMER_NORESTART &&
20822 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
20824 +               leftmost = enqueue_hrtimer(timer, base);
20825 +               if (!leftmost)
20826 +                       return;
20827 +#ifdef CONFIG_HIGH_RES_TIMERS
20828 +               if (!hrtimer_is_hres_active(timer)) {
20829 +                       /*
20830 +                        * Kick to reschedule the next tick to handle the new timer
20831 +                        * on dynticks target.
20832 +                        */
20833 +                       if (base->cpu_base->nohz_active)
20834 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
20835 +               } else {
20837 +                       hrtimer_reprogram(timer, base);
20838 +               }
20839 +#endif
20840 +       }
20844 + * The changes in mainline which removed the callback modes from
20845 + * hrtimer are not yet working with -rt. The callbacks which are not
20846 + * wakeup_process() based and which involve sleeping locks need to be
20847 + * treated separately.
20848 + */
20849 +static void hrtimer_rt_run_pending(void)
20851 +       enum hrtimer_restart (*fn)(struct hrtimer *);
20852 +       struct hrtimer_cpu_base *cpu_base;
20853 +       struct hrtimer_clock_base *base;
20854 +       struct hrtimer *timer;
20855 +       int index, restart;
20857 +       local_irq_disable();
20858 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
20860 +       raw_spin_lock(&cpu_base->lock);
20862 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
20863 +               base = &cpu_base->clock_base[index];
20865 +               while (!list_empty(&base->expired)) {
20866 +                       timer = list_first_entry(&base->expired,
20867 +                                                struct hrtimer, cb_entry);
20869 +                       /*
20870 +                        * Same as the above __run_hrtimer function
20871 +                        * except that we run with interrupts enabled.
20872 +                        */
20873 +                       debug_deactivate(timer);
20874 +                       cpu_base->running_soft = timer;
20875 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20877 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
20878 +                       timer_stats_account_hrtimer(timer);
20879 +                       fn = timer->function;
20881 +                       raw_spin_unlock_irq(&cpu_base->lock);
20882 +                       restart = fn(timer);
20883 +                       raw_spin_lock_irq(&cpu_base->lock);
20885 +                       hrtimer_rt_reprogram(restart, timer, base);
20886 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20888 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
20889 +                       cpu_base->running_soft = NULL;
20890 +               }
20891 +       }
20893 +       raw_spin_unlock_irq(&cpu_base->lock);
20895 +       wake_up_timer_waiters(cpu_base);
20898 +static int hrtimer_rt_defer(struct hrtimer *timer)
20900 +       if (timer->irqsafe)
20901 +               return 0;
20903 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
20904 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
20905 +       return 1;
20908 +#else
20910 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
20912 +#endif
20914 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
20916 +static int __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20918         struct hrtimer_clock_base *base = cpu_base->clock_base;
20919         unsigned int active = cpu_base->active_bases;
20920 +       int raise = 0;
20922         for (; active; base++, active >>= 1) {
20923                 struct timerqueue_node *node;
20924 @@ -1294,6 +1460,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20926                         timer = container_of(node, struct hrtimer, node);
20928 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
20929 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
20930 +                               timer->praecox : hrtimer_get_expires(timer),
20931 +                               basenow)),
20932 +                           current,
20933 +                           timer->function == hrtimer_wakeup ?
20934 +                           container_of(timer, struct hrtimer_sleeper,
20935 +                               timer)->task : NULL);
20937                         /*
20938                          * The immediate goal for using the softexpires is
20939                          * minimizing wakeups, not running timers at the
20940 @@ -1309,9 +1484,13 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20941                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
20942                                 break;
20944 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
20945 +                       if (!hrtimer_rt_defer(timer))
20946 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
20947 +                       else
20948 +                               raise = 1;
20949                 }
20950         }
20951 +       return raise;
20954  #ifdef CONFIG_HIGH_RES_TIMERS
20955 @@ -1325,6 +1504,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20956         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
20957         ktime_t expires_next, now, entry_time, delta;
20958         int retries = 0;
20959 +       int raise;
20961         BUG_ON(!cpu_base->hres_active);
20962         cpu_base->nr_events++;
20963 @@ -1343,7 +1523,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20964          */
20965         cpu_base->expires_next.tv64 = KTIME_MAX;
20967 -       __hrtimer_run_queues(cpu_base, now);
20968 +       raise = __hrtimer_run_queues(cpu_base, now);
20970         /* Reevaluate the clock bases for the next expiry */
20971         expires_next = __hrtimer_get_next_event(cpu_base);
20972 @@ -1354,6 +1534,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20973         cpu_base->expires_next = expires_next;
20974         cpu_base->in_hrtirq = 0;
20975         raw_spin_unlock(&cpu_base->lock);
20976 +       if (raise)
20977 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
20979         /* Reprogramming necessary ? */
20980         if (!tick_program_event(expires_next, 0)) {
20981 @@ -1433,6 +1615,7 @@ void hrtimer_run_queues(void)
20983         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
20984         ktime_t now;
20985 +       int raise;
20987         if (__hrtimer_hres_active(cpu_base))
20988                 return;
20989 @@ -1451,8 +1634,10 @@ void hrtimer_run_queues(void)
20991         raw_spin_lock(&cpu_base->lock);
20992         now = hrtimer_update_base(cpu_base);
20993 -       __hrtimer_run_queues(cpu_base, now);
20994 +       raise = __hrtimer_run_queues(cpu_base, now);
20995         raw_spin_unlock(&cpu_base->lock);
20996 +       if (raise)
20997 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
21000  /*
21001 @@ -1474,16 +1659,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
21002  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
21004         sl->timer.function = hrtimer_wakeup;
21005 +       sl->timer.irqsafe = 1;
21006         sl->task = task;
21008  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
21010 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
21011 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
21012 +                               unsigned long state)
21014         hrtimer_init_sleeper(t, current);
21016         do {
21017 -               set_current_state(TASK_INTERRUPTIBLE);
21018 +               set_current_state(state);
21019                 hrtimer_start_expires(&t->timer, mode);
21021                 if (likely(t->task))
21022 @@ -1525,7 +1712,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
21023                                 HRTIMER_MODE_ABS);
21024         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
21026 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
21027 +       /* cpu_chill() does not care about restart state. */
21028 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
21029                 goto out;
21031         rmtp = restart->nanosleep.rmtp;
21032 @@ -1542,8 +1730,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
21033         return ret;
21036 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21037 -                      const enum hrtimer_mode mode, const clockid_t clockid)
21038 +static long
21039 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21040 +                   const enum hrtimer_mode mode, const clockid_t clockid,
21041 +                   unsigned long state)
21043         struct restart_block *restart;
21044         struct hrtimer_sleeper t;
21045 @@ -1556,7 +1746,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21047         hrtimer_init_on_stack(&t.timer, clockid, mode);
21048         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
21049 -       if (do_nanosleep(&t, mode))
21050 +       if (do_nanosleep(&t, mode, state))
21051                 goto out;
21053         /* Absolute timers do not update the rmtp value and restart: */
21054 @@ -1583,6 +1773,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21055         return ret;
21058 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21059 +                      const enum hrtimer_mode mode, const clockid_t clockid)
21061 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
21064  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
21065                 struct timespec __user *, rmtp)
21067 @@ -1597,6 +1793,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
21068         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
21071 +#ifdef CONFIG_PREEMPT_RT_FULL
21073 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
21074 + */
21075 +void cpu_chill(void)
21077 +       struct timespec tu = {
21078 +               .tv_nsec = NSEC_PER_MSEC,
21079 +       };
21080 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
21082 +       current->flags |= PF_NOFREEZE;
21083 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
21084 +                           TASK_UNINTERRUPTIBLE);
21085 +       if (!freeze_flag)
21086 +               current->flags &= ~PF_NOFREEZE;
21088 +EXPORT_SYMBOL(cpu_chill);
21089 +#endif
21091  /*
21092   * Functions related to boot-time initialization:
21093   */
21094 @@ -1608,16 +1824,20 @@ int hrtimers_prepare_cpu(unsigned int cpu)
21095         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
21096                 cpu_base->clock_base[i].cpu_base = cpu_base;
21097                 timerqueue_init_head(&cpu_base->clock_base[i].active);
21098 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
21099         }
21101         cpu_base->cpu = cpu;
21102         hrtimer_init_hres(cpu_base);
21103 +#ifdef CONFIG_PREEMPT_RT_BASE
21104 +       init_waitqueue_head(&cpu_base->wait);
21105 +#endif
21106         return 0;
21109  #ifdef CONFIG_HOTPLUG_CPU
21111 -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21112 +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21113                                 struct hrtimer_clock_base *new_base)
21115         struct hrtimer *timer;
21116 @@ -1645,12 +1865,21 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21117                  */
21118                 enqueue_hrtimer(timer, new_base);
21119         }
21120 +#ifdef CONFIG_PREEMPT_RT_BASE
21121 +       list_splice_tail(&old_base->expired, &new_base->expired);
21122 +       /*
21123 +        * Tell the caller to raise HRTIMER_SOFTIRQ.  We can't safely
21124 +        * acquire ktimersoftd->pi_lock while the base lock is held.
21125 +        */
21126 +       return !list_empty(&new_base->expired);
21127 +#endif
21128 +       return 0;
21131  int hrtimers_dead_cpu(unsigned int scpu)
21133         struct hrtimer_cpu_base *old_base, *new_base;
21134 -       int i;
21135 +       int i, raise = 0;
21137         BUG_ON(cpu_online(scpu));
21138         tick_cancel_sched_timer(scpu);
21139 @@ -1666,13 +1895,16 @@ int hrtimers_dead_cpu(unsigned int scpu)
21140         raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
21142         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
21143 -               migrate_hrtimer_list(&old_base->clock_base[i],
21144 -                                    &new_base->clock_base[i]);
21145 +               raise |= migrate_hrtimer_list(&old_base->clock_base[i],
21146 +                                             &new_base->clock_base[i]);
21147         }
21149         raw_spin_unlock(&old_base->lock);
21150         raw_spin_unlock(&new_base->lock);
21152 +       if (raise)
21153 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
21155         /* Check, if we got expired work to do */
21156         __hrtimer_peek_ahead_timers();
21157         local_irq_enable();
21158 @@ -1681,9 +1913,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
21160  #endif /* CONFIG_HOTPLUG_CPU */
21162 +#ifdef CONFIG_PREEMPT_RT_BASE
21164 +static void run_hrtimer_softirq(struct softirq_action *h)
21166 +       hrtimer_rt_run_pending();
21169 +static void hrtimers_open_softirq(void)
21171 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
21174 +#else
21175 +static void hrtimers_open_softirq(void) { }
21176 +#endif
21178  void __init hrtimers_init(void)
21180         hrtimers_prepare_cpu(smp_processor_id());
21181 +       hrtimers_open_softirq();
21184  /**
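
Two additions above matter to users of this tree: hrtimer_cancel() now blocks on the base's waitqueue via hrtimer_wait_for_timer() instead of spinning, and cpu_chill() is provided on PREEMPT_RT_FULL as the replacement for cpu_relax()-style retry loops, sleeping for one millisecond so a high-priority task cannot livelock against whoever it is waiting for. An illustrative retry loop written in that style, assuming cpu_chill() is made visible via <linux/delay.h> as in other -rt trees; the helper is hypothetical:

#include <linux/atomic.h>
#include <linux/delay.h>

static void my_wait_for_flag(atomic_t *flag)	/* hypothetical helper */
{
	while (!atomic_read(flag)) {
#ifdef CONFIG_PREEMPT_RT_FULL
		cpu_chill();	/* sleep ~1 ms instead of burning the CPU */
#else
		cpu_relax();
#endif
	}
}
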
21185 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
21186 index 1d5c7204ddc9..184de6751180 100644
21187 --- a/kernel/time/itimer.c
21188 +++ b/kernel/time/itimer.c
21189 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
21190                 /* We are sharing ->siglock with it_real_fn() */
21191                 if (hrtimer_try_to_cancel(timer) < 0) {
21192                         spin_unlock_irq(&tsk->sighand->siglock);
21193 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
21194                         goto again;
21195                 }
21196                 expires = timeval_to_ktime(value->it_value);
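
The do_setitimer() change above is an instance of a recurring -rt pattern: hrtimer_try_to_cancel() returns a negative value while the callback is running, and instead of spinning under the lock the caller drops the lock, waits for the callback with hrtimer_wait_for_timer() (added by this series), and retries. A self-contained sketch of that pattern with hypothetical lock and timer names:

#include <linux/hrtimer.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);	/* hypothetical */
static struct hrtimer my_timer;		/* hypothetical */

static void my_cancel_and_rearm(ktime_t expires)
{
again:
	spin_lock_irq(&my_lock);
	if (hrtimer_try_to_cancel(&my_timer) < 0) {
		spin_unlock_irq(&my_lock);
		hrtimer_wait_for_timer(&my_timer);	/* blocks until the callback has finished */
		goto again;
	}
	/* Neither queued nor running at this point. */
	hrtimer_start(&my_timer, expires, HRTIMER_MODE_ABS);
	spin_unlock_irq(&my_lock);
}
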
21197 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
21198 index 555e21f7b966..a5d6435fabbb 100644
21199 --- a/kernel/time/jiffies.c
21200 +++ b/kernel/time/jiffies.c
21201 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
21202         .max_cycles     = 10,
21203  };
21205 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
21206 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
21207 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
21209  #if (BITS_PER_LONG < 64)
21210  u64 get_jiffies_64(void)
21211 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
21212         u64 ret;
21214         do {
21215 -               seq = read_seqbegin(&jiffies_lock);
21216 +               seq = read_seqcount_begin(&jiffies_seq);
21217                 ret = jiffies_64;
21218 -       } while (read_seqretry(&jiffies_lock, seq));
21219 +       } while (read_seqcount_retry(&jiffies_seq, seq));
21220         return ret;
21222  EXPORT_SYMBOL(get_jiffies_64);
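
The seqlock is split here into a raw spinlock (writer serialization) plus a bare seqcount (reader retry) so readers never have to take a lock that sleeps on -rt. The rule the rest of this patch follows: writers take jiffies_lock and bracket the update with write_seqcount_begin()/end(); readers use only jiffies_seq, as get_jiffies_64() above shows. A small self-contained sketch of the same pairing for a hypothetical 64-bit value:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(my_lock);		/* hypothetical, mirrors jiffies_lock */
static seqcount_t my_seq = SEQCNT_ZERO(my_seq);	/* mirrors jiffies_seq */
static u64 my_value;

static void my_write(u64 v)
{
	raw_spin_lock(&my_lock);		/* serialize writers */
	write_seqcount_begin(&my_seq);		/* readers started now will retry */
	my_value = v;
	write_seqcount_end(&my_seq);
	raw_spin_unlock(&my_lock);
}

static u64 my_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&my_seq);
		v = my_value;
	} while (read_seqcount_retry(&my_seq, seq));
	return v;
}
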
21223 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
21224 index 6df8927c58a5..05b7391bf9bd 100644
21225 --- a/kernel/time/ntp.c
21226 +++ b/kernel/time/ntp.c
21227 @@ -17,6 +17,7 @@
21228  #include <linux/module.h>
21229  #include <linux/rtc.h>
21230  #include <linux/math64.h>
21231 +#include <linux/swork.h>
21233  #include "ntp_internal.h"
21234  #include "timekeeping_internal.h"
21235 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
21236                            &sync_cmos_work, timespec64_to_jiffies(&next));
21239 +#ifdef CONFIG_PREEMPT_RT_FULL
21241 +static void run_clock_set_delay(struct swork_event *event)
21243 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
21246 +static struct swork_event ntp_cmos_swork;
21248 +void ntp_notify_cmos_timer(void)
21250 +       swork_queue(&ntp_cmos_swork);
21253 +static __init int create_cmos_delay_thread(void)
21255 +       WARN_ON(swork_get());
21256 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
21257 +       return 0;
21259 +early_initcall(create_cmos_delay_thread);
21261 +#else
21263  void ntp_notify_cmos_timer(void)
21265         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
21267 +#endif /* CONFIG_PREEMPT_RT_FULL */
21269  #else
21270  void ntp_notify_cmos_timer(void) { }
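
Both the hrtimer and ntp hunks above use the same deferral idiom: work that ends up in clock_was_set() or queue_delayed_work() may take sleeping locks, so on PREEMPT_RT_FULL it is handed to the "simple work" (swork) helper thread and only swork_queue() is called from the restricted context. A minimal sketch mirroring those hunks; the my_* names are hypothetical:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/swork.h>

static struct swork_event my_event;		/* hypothetical */

static void my_deferred_fn(struct swork_event *event)
{
	/* Runs in the swork kernel thread: process context, may sleep. */
}

static __init int my_setup_swork(void)
{
	WARN_ON(swork_get());			/* make sure the worker thread exists */
	INIT_SWORK(&my_event, my_deferred_fn);
	return 0;
}
early_initcall(my_setup_swork);

/* From a context that must not sleep, e.g. a timer callback: */
static void my_kick(void)
{
	swork_queue(&my_event);
}
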
21271 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
21272 index 39008d78927a..633f4eaca9e7 100644
21273 --- a/kernel/time/posix-cpu-timers.c
21274 +++ b/kernel/time/posix-cpu-timers.c
21275 @@ -3,6 +3,7 @@
21276   */
21278  #include <linux/sched.h>
21279 +#include <linux/sched/rt.h>
21280  #include <linux/posix-timers.h>
21281  #include <linux/errno.h>
21282  #include <linux/math64.h>
21283 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
21284         /*
21285          * Disarm any old timer after extracting its expiry time.
21286          */
21287 -       WARN_ON_ONCE(!irqs_disabled());
21288 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21290         ret = 0;
21291         old_incr = timer->it.cpu.incr;
21292 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
21293         /*
21294          * Now re-arm for the new expiry time.
21295          */
21296 -       WARN_ON_ONCE(!irqs_disabled());
21297 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21298         arm_timer(timer);
21299         unlock_task_sighand(p, &flags);
21301 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
21302   * already updated our counts.  We need to check if any timers fire now.
21303   * Interrupts are disabled.
21304   */
21305 -void run_posix_cpu_timers(struct task_struct *tsk)
21306 +static void __run_posix_cpu_timers(struct task_struct *tsk)
21308         LIST_HEAD(firing);
21309         struct k_itimer *timer, *next;
21310         unsigned long flags;
21312 -       WARN_ON_ONCE(!irqs_disabled());
21313 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21315         /*
21316          * The fast path checks that there are no expired thread or thread
21317 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
21318         }
21321 +#ifdef CONFIG_PREEMPT_RT_BASE
21322 +#include <linux/kthread.h>
21323 +#include <linux/cpu.h>
21324 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
21325 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
21327 +static int posix_cpu_timers_thread(void *data)
21329 +       int cpu = (long)data;
21331 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
21333 +       while (!kthread_should_stop()) {
21334 +               struct task_struct *tsk = NULL;
21335 +               struct task_struct *next = NULL;
21337 +               if (cpu_is_offline(cpu))
21338 +                       goto wait_to_die;
21340 +               /* grab task list */
21341 +               raw_local_irq_disable();
21342 +               tsk = per_cpu(posix_timer_tasklist, cpu);
21343 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
21344 +               raw_local_irq_enable();
21346 +               /* it's possible the list is empty, just go back to sleep */
21347 +               if (!tsk) {
21348 +                       set_current_state(TASK_INTERRUPTIBLE);
21349 +                       schedule();
21350 +                       __set_current_state(TASK_RUNNING);
21351 +                       continue;
21352 +               }
21354 +               /* Process task list */
21355 +               while (1) {
21356 +                       /* save next */
21357 +                       next = tsk->posix_timer_list;
21359 +                       /* run the task timers, clear its ptr and
21360 +                        * unreference it
21361 +                        */
21362 +                       __run_posix_cpu_timers(tsk);
21363 +                       tsk->posix_timer_list = NULL;
21364 +                       put_task_struct(tsk);
21366 +                       /* check if this is the last on the list */
21367 +                       if (next == tsk)
21368 +                               break;
21369 +                       tsk = next;
21370 +               }
21371 +       }
21372 +       return 0;
21374 +wait_to_die:
21375 +       /* Wait for kthread_stop */
21376 +       set_current_state(TASK_INTERRUPTIBLE);
21377 +       while (!kthread_should_stop()) {
21378 +               schedule();
21379 +               set_current_state(TASK_INTERRUPTIBLE);
21380 +       }
21381 +       __set_current_state(TASK_RUNNING);
21382 +       return 0;
21385 +static inline int __fastpath_timer_check(struct task_struct *tsk)
21387 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
21388 +       if (unlikely(tsk->exit_state))
21389 +               return 0;
21391 +       if (!task_cputime_zero(&tsk->cputime_expires))
21392 +                       return 1;
21394 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
21395 +                       return 1;
21397 +       return 0;
21400 +void run_posix_cpu_timers(struct task_struct *tsk)
21402 +       unsigned long cpu = smp_processor_id();
21403 +       struct task_struct *tasklist;
21405 +       BUG_ON(!irqs_disabled());
21406 +       if (!per_cpu(posix_timer_task, cpu))
21407 +               return;
21408 +       /* get per-cpu references */
21409 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
21411 +       /* check to see if we're already queued */
21412 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
21413 +               get_task_struct(tsk);
21414 +               if (tasklist) {
21415 +                       tsk->posix_timer_list = tasklist;
21416 +               } else {
21417 +                       /*
21418 +                        * The list is terminated by a self-pointing
21419 +                        * task_struct
21420 +                        */
21421 +                       tsk->posix_timer_list = tsk;
21422 +               }
21423 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
21425 +               wake_up_process(per_cpu(posix_timer_task, cpu));
21426 +       }
21430 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
21431 + * Here we can start up the necessary posix timer thread for the new CPU.
21432 + */
21433 +static int posix_cpu_thread_call(struct notifier_block *nfb,
21434 +                                unsigned long action, void *hcpu)
21436 +       int cpu = (long)hcpu;
21437 +       struct task_struct *p;
21438 +       struct sched_param param;
21440 +       switch (action) {
21441 +       case CPU_UP_PREPARE:
21442 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
21443 +                                       "posixcputmr/%d",cpu);
21444 +               if (IS_ERR(p))
21445 +                       return NOTIFY_BAD;
21446 +               p->flags |= PF_NOFREEZE;
21447 +               kthread_bind(p, cpu);
21448 +               /* Must be high prio to avoid getting starved */
21449 +               param.sched_priority = MAX_RT_PRIO-1;
21450 +               sched_setscheduler(p, SCHED_FIFO, &param);
21451 +               per_cpu(posix_timer_task,cpu) = p;
21452 +               break;
21453 +       case CPU_ONLINE:
21454 +               /* Strictly unnecessary, as first user will wake it. */
21455 +               wake_up_process(per_cpu(posix_timer_task,cpu));
21456 +               break;
21457 +#ifdef CONFIG_HOTPLUG_CPU
21458 +       case CPU_UP_CANCELED:
21459 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
21460 +               kthread_bind(per_cpu(posix_timer_task, cpu),
21461 +                            cpumask_any(cpu_online_mask));
21462 +               kthread_stop(per_cpu(posix_timer_task,cpu));
21463 +               per_cpu(posix_timer_task,cpu) = NULL;
21464 +               break;
21465 +       case CPU_DEAD:
21466 +               kthread_stop(per_cpu(posix_timer_task,cpu));
21467 +               per_cpu(posix_timer_task,cpu) = NULL;
21468 +               break;
21469 +#endif
21470 +       }
21471 +       return NOTIFY_OK;
21474 +/* Register at highest priority so that task migration (migrate_all_tasks)
21475 + * happens before everything else.
21476 + */
21477 +static struct notifier_block posix_cpu_thread_notifier = {
21478 +       .notifier_call = posix_cpu_thread_call,
21479 +       .priority = 10
21482 +static int __init posix_cpu_thread_init(void)
21484 +       void *hcpu = (void *)(long)smp_processor_id();
21485 +       /* Start one for boot CPU. */
21486 +       unsigned long cpu;
21488 +       /* init the per-cpu posix_timer_tasklets */
21489 +       for_each_possible_cpu(cpu)
21490 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
21492 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
21493 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
21494 +       register_cpu_notifier(&posix_cpu_thread_notifier);
21495 +       return 0;
21497 +early_initcall(posix_cpu_thread_init);
21498 +#else /* CONFIG_PREEMPT_RT_BASE */
21499 +void run_posix_cpu_timers(struct task_struct *tsk)
21501 +       __run_posix_cpu_timers(tsk);
21503 +#endif /* CONFIG_PREEMPT_RT_BASE */
21505  /*
21506   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
21507   * The tsk->sighand->siglock must be held by the caller.
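
The task list handed to posixcputmr/N above uses a slightly unusual convention: tsk->posix_timer_list is NULL when the task is not queued, and the last element points to itself, so the walker can tell "end of list" from "not queued" without a separate head structure. An isolated, hypothetical illustration of the same walk:

/* Stand-alone sketch of the self-terminated list used by the per-CPU
 * posix timer threads above; struct my_node is hypothetical. */
struct my_node {
	struct my_node *next;	/* NULL = not queued, self = last element */
};

static void my_walk(struct my_node *head)
{
	while (head) {
		struct my_node *next = head->next;

		/* ... process 'head' (run its timers) ... */
		head->next = NULL;	/* mark as no longer queued */
		if (next == head)	/* self-pointer terminates the list */
			break;
		head = next;
	}
}
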
21508 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
21509 index f2826c35e918..464a98155a0e 100644
21510 --- a/kernel/time/posix-timers.c
21511 +++ b/kernel/time/posix-timers.c
21512 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
21513  static struct pid *good_sigevent(sigevent_t * event)
21515         struct task_struct *rtn = current->group_leader;
21516 +       int sig = event->sigev_signo;
21518         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
21519                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
21520 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
21521                 return NULL;
21523         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
21524 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
21525 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
21526 +            sig_kernel_coredump(sig)))
21527                 return NULL;
21529         return task_pid(rtn);
21530 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
21531         return overrun;
21535 + * Protected by RCU!
21536 + */
21537 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
21539 +#ifdef CONFIG_PREEMPT_RT_FULL
21540 +       if (kc->timer_set == common_timer_set)
21541 +               hrtimer_wait_for_timer(&timr->it.real.timer);
21542 +       else
21543 +               /* FIXME: Whacky hack for posix-cpu-timers */
21544 +               schedule_timeout(1);
21545 +#endif
21548  /* Set a POSIX.1b interval timer. */
21549  /* timr->it_lock is taken. */
21550  static int
21551 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
21552         if (!timr)
21553                 return -EINVAL;
21555 +       rcu_read_lock();
21556         kc = clockid_to_kclock(timr->it_clock);
21557         if (WARN_ON_ONCE(!kc || !kc->timer_set))
21558                 error = -EINVAL;
21559 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
21561         unlock_timer(timr, flag);
21562         if (error == TIMER_RETRY) {
21563 +               timer_wait_for_callback(kc, timr);
21564                 rtn = NULL;     // We already got the old time...
21565 +               rcu_read_unlock();
21566                 goto retry;
21567         }
21568 +       rcu_read_unlock();
21570         if (old_setting && !error &&
21571             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
21572 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
21573         if (!timer)
21574                 return -EINVAL;
21576 +       rcu_read_lock();
21577         if (timer_delete_hook(timer) == TIMER_RETRY) {
21578                 unlock_timer(timer, flags);
21579 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
21580 +                                       timer);
21581 +               rcu_read_unlock();
21582                 goto retry_delete;
21583         }
21584 +       rcu_read_unlock();
21586         spin_lock(&current->sighand->siglock);
21587         list_del(&timer->list);
21588 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
21589  retry_delete:
21590         spin_lock_irqsave(&timer->it_lock, flags);
21592 +       /* On RT we can race with a deletion */
21593 +       if (!timer->it_signal) {
21594 +               unlock_timer(timer, flags);
21595 +               return;
21596 +       }
21598         if (timer_delete_hook(timer) == TIMER_RETRY) {
21599 +               rcu_read_lock();
21600                 unlock_timer(timer, flags);
21601 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
21602 +                                       timer);
21603 +               rcu_read_unlock();
21604                 goto retry_delete;
21605         }
21606         list_del(&timer->list);
21607 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
21608 index 690b797f522e..fe8ba1619879 100644
21609 --- a/kernel/time/tick-broadcast-hrtimer.c
21610 +++ b/kernel/time/tick-broadcast-hrtimer.c
21611 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
21613         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
21614         bctimer.function = bc_handler;
21615 +       bctimer.irqsafe = true;
21616         clockevents_register_device(&ce_broadcast_hrtimer);
21618 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
21619 index 4fcd99e12aa0..5a47f2e98faf 100644
21620 --- a/kernel/time/tick-common.c
21621 +++ b/kernel/time/tick-common.c
21622 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
21623  static void tick_periodic(int cpu)
21625         if (tick_do_timer_cpu == cpu) {
21626 -               write_seqlock(&jiffies_lock);
21627 +               raw_spin_lock(&jiffies_lock);
21628 +               write_seqcount_begin(&jiffies_seq);
21630                 /* Keep track of the next tick event */
21631                 tick_next_period = ktime_add(tick_next_period, tick_period);
21633                 do_timer(1);
21634 -               write_sequnlock(&jiffies_lock);
21635 +               write_seqcount_end(&jiffies_seq);
21636 +               raw_spin_unlock(&jiffies_lock);
21637                 update_wall_time();
21638         }
21640 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
21641                 ktime_t next;
21643                 do {
21644 -                       seq = read_seqbegin(&jiffies_lock);
21645 +                       seq = read_seqcount_begin(&jiffies_seq);
21646                         next = tick_next_period;
21647 -               } while (read_seqretry(&jiffies_lock, seq));
21648 +               } while (read_seqcount_retry(&jiffies_seq, seq));
21650                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
21652 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
21653 index dae1a45be504..c573b1a848b6 100644
21654 --- a/kernel/time/tick-sched.c
21655 +++ b/kernel/time/tick-sched.c
21656 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
21657                 return;
21659         /* Reevaluate with jiffies_lock held */
21660 -       write_seqlock(&jiffies_lock);
21661 +       raw_spin_lock(&jiffies_lock);
21662 +       write_seqcount_begin(&jiffies_seq);
21664         delta = ktime_sub(now, last_jiffies_update);
21665         if (delta.tv64 >= tick_period.tv64) {
21666 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
21667                 /* Keep the tick_next_period variable up to date */
21668                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
21669         } else {
21670 -               write_sequnlock(&jiffies_lock);
21671 +               write_seqcount_end(&jiffies_seq);
21672 +               raw_spin_unlock(&jiffies_lock);
21673                 return;
21674         }
21675 -       write_sequnlock(&jiffies_lock);
21676 +       write_seqcount_end(&jiffies_seq);
21677 +       raw_spin_unlock(&jiffies_lock);
21678         update_wall_time();
21681 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
21683         ktime_t period;
21685 -       write_seqlock(&jiffies_lock);
21686 +       raw_spin_lock(&jiffies_lock);
21687 +       write_seqcount_begin(&jiffies_seq);
21688         /* Did we start the jiffies update yet ? */
21689         if (last_jiffies_update.tv64 == 0)
21690                 last_jiffies_update = tick_next_period;
21691         period = last_jiffies_update;
21692 -       write_sequnlock(&jiffies_lock);
21693 +       write_seqcount_end(&jiffies_seq);
21694 +       raw_spin_unlock(&jiffies_lock);
21695         return period;
21698 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
21700  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
21701         .func = nohz_full_kick_func,
21702 +       .flags = IRQ_WORK_HARD_IRQ,
21703  };
21705  /*
21706 @@ -678,10 +684,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
21708         /* Read jiffies and the time when jiffies were updated last */
21709         do {
21710 -               seq = read_seqbegin(&jiffies_lock);
21711 +               seq = read_seqcount_begin(&jiffies_seq);
21712                 basemono = last_jiffies_update.tv64;
21713                 basejiff = jiffies;
21714 -       } while (read_seqretry(&jiffies_lock, seq));
21715 +       } while (read_seqcount_retry(&jiffies_seq, seq));
21716         ts->last_jiffies = basejiff;
21718         /*
21719 @@ -892,14 +898,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
21720                 return false;
21722         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
21723 -               static int ratelimit;
21725 -               if (ratelimit < 10 &&
21726 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
21727 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
21728 -                               (unsigned int) local_softirq_pending());
21729 -                       ratelimit++;
21730 -               }
21731 +               softirq_check_pending_idle();
21732                 return false;
21733         }
21735 @@ -1208,6 +1207,7 @@ void tick_setup_sched_timer(void)
21736          * Emulate tick processing via per-CPU hrtimers:
21737          */
21738         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
21739 +       ts->sched_timer.irqsafe = 1;
21740         ts->sched_timer.function = tick_sched_timer;
21742         /* Get the next period (per-CPU) */
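
The irqsafe flag set on the tick and broadcast hrtimers above is the opt-out from the new deferral path: on PREEMPT_RT_BASE, non-irqsafe hrtimers are pushed to HRTIMER_SOFTIRQ by hrtimer_rt_defer() earlier in this patch, and only timers explicitly marked irqsafe keep expiring from hard interrupt context. A hedged sketch of how a timer would be marked, assuming the irqsafe field added by this series and a hypothetical callback that takes no sleeping locks:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;		/* hypothetical */

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	/* Must not take sleeping locks: with ->irqsafe set this still runs
	 * from the hrtimer interrupt on PREEMPT_RT. */
	return HRTIMER_NORESTART;
}

static void my_arm(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	my_timer.irqsafe = 1;		/* field added by the -rt series */
	hrtimer_start(&my_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}
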
21743 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
21744 index d831827d7ab0..76d982c11ac3 100644
21745 --- a/kernel/time/timekeeping.c
21746 +++ b/kernel/time/timekeeping.c
21747 @@ -2348,8 +2348,10 @@ EXPORT_SYMBOL(hardpps);
21748   */
21749  void xtime_update(unsigned long ticks)
21751 -       write_seqlock(&jiffies_lock);
21752 +       raw_spin_lock(&jiffies_lock);
21753 +       write_seqcount_begin(&jiffies_seq);
21754         do_timer(ticks);
21755 -       write_sequnlock(&jiffies_lock);
21756 +       write_seqcount_end(&jiffies_seq);
21757 +       raw_spin_unlock(&jiffies_lock);
21758         update_wall_time();
21760 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
21761 index 704f595ce83f..763a3e5121ff 100644
21762 --- a/kernel/time/timekeeping.h
21763 +++ b/kernel/time/timekeeping.h
21764 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
21765  extern void do_timer(unsigned long ticks);
21766  extern void update_wall_time(void);
21768 -extern seqlock_t jiffies_lock;
21769 +extern raw_spinlock_t jiffies_lock;
21770 +extern seqcount_t jiffies_seq;
21772  #define CS_NAME_LEN    32
21774 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
21775 index e872f7f05e8a..8e75e7442aaa 100644
21776 --- a/kernel/time/timer.c
21777 +++ b/kernel/time/timer.c
21778 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
21779  #endif
21781  struct timer_base {
21782 -       spinlock_t              lock;
21783 +       raw_spinlock_t          lock;
21784         struct timer_list       *running_timer;
21785 +#ifdef CONFIG_PREEMPT_RT_FULL
21786 +       struct swait_queue_head wait_for_running_timer;
21787 +#endif
21788         unsigned long           clk;
21789         unsigned long           next_expiry;
21790         unsigned int            cpu;
21791 @@ -953,10 +956,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
21793                 if (!(tf & TIMER_MIGRATING)) {
21794                         base = get_timer_base(tf);
21795 -                       spin_lock_irqsave(&base->lock, *flags);
21796 +                       raw_spin_lock_irqsave(&base->lock, *flags);
21797                         if (timer->flags == tf)
21798                                 return base;
21799 -                       spin_unlock_irqrestore(&base->lock, *flags);
21800 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
21801                 }
21802                 cpu_relax();
21803         }
21804 @@ -1033,9 +1036,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21805                         /* See the comment in lock_timer_base() */
21806                         timer->flags |= TIMER_MIGRATING;
21808 -                       spin_unlock(&base->lock);
21809 +                       raw_spin_unlock(&base->lock);
21810                         base = new_base;
21811 -                       spin_lock(&base->lock);
21812 +                       raw_spin_lock(&base->lock);
21813                         WRITE_ONCE(timer->flags,
21814                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
21815                         forward_timer_base(base);
21816 @@ -1060,7 +1063,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21817         }
21819  out_unlock:
21820 -       spin_unlock_irqrestore(&base->lock, flags);
21821 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21823         return ret;
21825 @@ -1154,9 +1157,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
21826         if (base != new_base) {
21827                 timer->flags |= TIMER_MIGRATING;
21829 -               spin_unlock(&base->lock);
21830 +               raw_spin_unlock(&base->lock);
21831                 base = new_base;
21832 -               spin_lock(&base->lock);
21833 +               raw_spin_lock(&base->lock);
21834                 WRITE_ONCE(timer->flags,
21835                            (timer->flags & ~TIMER_BASEMASK) | cpu);
21836         }
21837 @@ -1164,10 +1167,37 @@ void add_timer_on(struct timer_list *timer, int cpu)
21839         debug_activate(timer, timer->expires);
21840         internal_add_timer(base, timer);
21841 -       spin_unlock_irqrestore(&base->lock, flags);
21842 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21844  EXPORT_SYMBOL_GPL(add_timer_on);
21846 +#ifdef CONFIG_PREEMPT_RT_FULL
21848 + * Wait for a running timer
21849 + */
21850 +static void wait_for_running_timer(struct timer_list *timer)
21852 +       struct timer_base *base;
21853 +       u32 tf = timer->flags;
21855 +       if (tf & TIMER_MIGRATING)
21856 +               return;
21858 +       base = get_timer_base(tf);
21859 +       swait_event(base->wait_for_running_timer,
21860 +                  base->running_timer != timer);
21863 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
21864 +#else
21865 +static inline void wait_for_running_timer(struct timer_list *timer)
21867 +       cpu_relax();
21870 +# define wakeup_timer_waiters(b)       do { } while (0)
21871 +#endif
21873  /**
21874   * del_timer - deactive a timer.
21875   * @timer: the timer to be deactivated
21876 @@ -1191,7 +1221,7 @@ int del_timer(struct timer_list *timer)
21877         if (timer_pending(timer)) {
21878                 base = lock_timer_base(timer, &flags);
21879                 ret = detach_if_pending(timer, base, true);
21880 -               spin_unlock_irqrestore(&base->lock, flags);
21881 +               raw_spin_unlock_irqrestore(&base->lock, flags);
21882         }
21884         return ret;
21885 @@ -1219,13 +1249,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
21886                 timer_stats_timer_clear_start_info(timer);
21887                 ret = detach_if_pending(timer, base, true);
21888         }
21889 -       spin_unlock_irqrestore(&base->lock, flags);
21890 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21892         return ret;
21894  EXPORT_SYMBOL(try_to_del_timer_sync);
21896 -#ifdef CONFIG_SMP
21897 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
21898  /**
21899   * del_timer_sync - deactivate a timer and wait for the handler to finish.
21900   * @timer: the timer to be deactivated
21901 @@ -1285,7 +1315,7 @@ int del_timer_sync(struct timer_list *timer)
21902                 int ret = try_to_del_timer_sync(timer);
21903                 if (ret >= 0)
21904                         return ret;
21905 -               cpu_relax();
21906 +               wait_for_running_timer(timer);
21907         }
21909  EXPORT_SYMBOL(del_timer_sync);
21910 @@ -1350,14 +1380,17 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
21911                 fn = timer->function;
21912                 data = timer->data;
21914 -               if (timer->flags & TIMER_IRQSAFE) {
21915 -                       spin_unlock(&base->lock);
21916 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
21917 +                   timer->flags & TIMER_IRQSAFE) {
21918 +                       raw_spin_unlock(&base->lock);
21919                         call_timer_fn(timer, fn, data);
21920 -                       spin_lock(&base->lock);
21921 +                       base->running_timer = NULL;
21922 +                       raw_spin_lock(&base->lock);
21923                 } else {
21924 -                       spin_unlock_irq(&base->lock);
21925 +                       raw_spin_unlock_irq(&base->lock);
21926                         call_timer_fn(timer, fn, data);
21927 -                       spin_lock_irq(&base->lock);
21928 +                       base->running_timer = NULL;
21929 +                       raw_spin_lock_irq(&base->lock);
21930                 }
21931         }
21933 @@ -1526,7 +1559,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21934         if (cpu_is_offline(smp_processor_id()))
21935                 return expires;
21937 -       spin_lock(&base->lock);
21938 +       raw_spin_lock(&base->lock);
21939         nextevt = __next_timer_interrupt(base);
21940         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
21941         base->next_expiry = nextevt;
21942 @@ -1560,7 +1593,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21943                         base->is_idle = true;
21944                 }
21945         }
21946 -       spin_unlock(&base->lock);
21947 +       raw_spin_unlock(&base->lock);
21949         return cmp_next_hrtimer_event(basem, expires);
21951 @@ -1625,13 +1658,13 @@ void update_process_times(int user_tick)
21953         /* Note: this timer irq context must be accounted for as well. */
21954         account_process_tick(p, user_tick);
21955 +       scheduler_tick();
21956         run_local_timers();
21957         rcu_check_callbacks(user_tick);
21958 -#ifdef CONFIG_IRQ_WORK
21959 +#if defined(CONFIG_IRQ_WORK)
21960         if (in_irq())
21961                 irq_work_tick();
21962  #endif
21963 -       scheduler_tick();
21964         run_posix_cpu_timers(p);
21967 @@ -1647,7 +1680,7 @@ static inline void __run_timers(struct timer_base *base)
21968         if (!time_after_eq(jiffies, base->clk))
21969                 return;
21971 -       spin_lock_irq(&base->lock);
21972 +       raw_spin_lock_irq(&base->lock);
21974         while (time_after_eq(jiffies, base->clk)) {
21976 @@ -1657,8 +1690,8 @@ static inline void __run_timers(struct timer_base *base)
21977                 while (levels--)
21978                         expire_timers(base, heads + levels);
21979         }
21980 -       base->running_timer = NULL;
21981 -       spin_unlock_irq(&base->lock);
21982 +       raw_spin_unlock_irq(&base->lock);
21983 +       wakeup_timer_waiters(base);
21986  /*
21987 @@ -1681,6 +1714,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
21988          */
21989         base->must_forward_clk = false;
21991 +       irq_work_tick_soft();
21993         __run_timers(base);
21994         if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
21995                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
21996 @@ -1881,16 +1916,16 @@ int timers_dead_cpu(unsigned int cpu)
21997                  * The caller is globally serialized and nobody else
21998                  * takes two locks at once, deadlock is not possible.
21999                  */
22000 -               spin_lock_irq(&new_base->lock);
22001 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
22002 +               raw_spin_lock_irq(&new_base->lock);
22003 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
22005                 BUG_ON(old_base->running_timer);
22007                 for (i = 0; i < WHEEL_SIZE; i++)
22008                         migrate_timer_list(new_base, old_base->vectors + i);
22010 -               spin_unlock(&old_base->lock);
22011 -               spin_unlock_irq(&new_base->lock);
22012 +               raw_spin_unlock(&old_base->lock);
22013 +               raw_spin_unlock_irq(&new_base->lock);
22014                 put_cpu_ptr(&timer_bases);
22015         }
22016         return 0;
22017 @@ -1906,8 +1941,11 @@ static void __init init_timer_cpu(int cpu)
22018         for (i = 0; i < NR_BASES; i++) {
22019                 base = per_cpu_ptr(&timer_bases[i], cpu);
22020                 base->cpu = cpu;
22021 -               spin_lock_init(&base->lock);
22022 +               raw_spin_lock_init(&base->lock);
22023                 base->clk = jiffies;
22024 +#ifdef CONFIG_PREEMPT_RT_FULL
22025 +               init_swait_queue_head(&base->wait_for_running_timer);
22026 +#endif
22027         }
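On PREEMPT_RT_FULL the timer.c hunks above stop del_timer_sync() from busy-waiting on a running callback: wait_for_running_timer() sleeps on the per-base swait queue until base->running_timer no longer points at the timer being cancelled, and __run_timers() wakes all waiters via wakeup_timer_waiters() once the callbacks have finished and the base lock has been dropped. Below is a minimal user-space analogue of that handshake, built on pthreads rather than the kernel's swait/raw-spinlock primitives; the names and the simplified locking are illustrative only, not kernel API.

/* User-space sketch of the "wait for a running timer" handshake. */
#include <pthread.h>

struct timer_base_sim {
	pthread_mutex_t lock;          /* stands in for base->lock */
	pthread_cond_t  waiters;       /* stands in for wait_for_running_timer */
	void           *running_timer; /* timer whose callback is executing */
};

/* Expiry side: publish the running timer, run the callback without the
 * lock held, then clear it and wake any canceller (cf. expire_timers()
 * plus wakeup_timer_waiters()). */
static void run_one_timer(struct timer_base_sim *base, void *timer,
			  void (*fn)(void *))
{
	pthread_mutex_lock(&base->lock);
	base->running_timer = timer;
	pthread_mutex_unlock(&base->lock);

	fn(timer);

	pthread_mutex_lock(&base->lock);
	base->running_timer = NULL;
	pthread_cond_broadcast(&base->waiters);
	pthread_mutex_unlock(&base->lock);
}

/* Cancellation side: sleep until the callback is done instead of
 * spinning (cf. wait_for_running_timer() under PREEMPT_RT_FULL). */
static void wait_for_running_timer_sim(struct timer_base_sim *base,
				       void *timer)
{
	pthread_mutex_lock(&base->lock);
	while (base->running_timer == timer)
		pthread_cond_wait(&base->waiters, &base->lock);
	pthread_mutex_unlock(&base->lock);
}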
22030 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
22031 index 2a96b063d659..812e37237eb8 100644
22032 --- a/kernel/trace/Kconfig
22033 +++ b/kernel/trace/Kconfig
22034 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
22035           enabled. This option and the preempt-off timing option can be
22036           used together or separately.)
22038 +config INTERRUPT_OFF_HIST
22039 +       bool "Interrupts-off Latency Histogram"
22040 +       depends on IRQSOFF_TRACER
22041 +       help
22042 +         This option generates continuously updated histograms (one per cpu)
22043 +         of the duration of time periods with interrupts disabled. The
22044 +         histograms are disabled by default. To enable them, write a non-zero
22045 +         number to
22047 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
22049 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
22050 +         per cpu) are generated that accumulate the duration of time periods
22051 +         when both interrupts and preemption are disabled. The histogram data
22052 +         will be located in the debug file system at
22054 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
22056  config PREEMPT_TRACER
22057         bool "Preemption-off Latency Tracer"
22058         default n
22059 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
22060           enabled. This option and the irqs-off timing option can be
22061           used together or separately.)
22063 +config PREEMPT_OFF_HIST
22064 +       bool "Preemption-off Latency Histogram"
22065 +       depends on PREEMPT_TRACER
22066 +       help
22067 +         This option generates continuously updated histograms (one per cpu)
22068 +         of the duration of time periods with preemption disabled. The
22069 +         histograms are disabled by default. To enable them, write a non-zero
22070 +         number to
22072 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
22074 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
22075 +         per cpu) are generated that accumulate the duration of time periods
22076 +         when both interrupts and preemption are disabled. The histogram data
22077 +         will be located in the debug file system at
22079 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
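Both histogram options above share a single switch: writing a non-zero number to latency_hist/enable/preemptirqsoff activates whichever of the irqsoff/preemptoff/preemptirqsoff histograms were compiled in, and each histogram directory then contains one CPU%d file per possible CPU (see latency_hist_init() further down in this patch). A small user-space sketch of that workflow, assuming debugfs is mounted at /sys/kernel/debug and the options are enabled in the kernel configuration:

#include <stdio.h>

#define HIST_DIR "/sys/kernel/debug/tracing/latency_hist"

int main(void)
{
	/* Write a non-zero number to the shared enable file. */
	FILE *en = fopen(HIST_DIR "/enable/preemptirqsoff", "w");
	if (!en) {
		perror("enable/preemptirqsoff");
		return 1;
	}
	fputs("1\n", en);
	fclose(en);

	/* Dump the irqsoff histogram collected for CPU 0. */
	FILE *h = fopen(HIST_DIR "/irqsoff/CPU0", "r");
	if (!h) {
		perror("irqsoff/CPU0");
		return 1;
	}
	char line[256];
	while (fgets(line, sizeof(line), h))
		fputs(line, stdout);
	fclose(h);
	return 0;
}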
22081  config SCHED_TRACER
22082         bool "Scheduling Latency Tracer"
22083         select GENERIC_TRACER
22084 @@ -251,6 +287,74 @@ config HWLAT_TRACER
22085          file. Every time a latency is greater than tracing_thresh, it will
22086          be recorded into the ring buffer.
22088 +config WAKEUP_LATENCY_HIST
22089 +       bool "Scheduling Latency Histogram"
22090 +       depends on SCHED_TRACER
22091 +       help
22092 +         This option generates continuously updated histograms (one per cpu)
22093 +         of the scheduling latency of the highest priority task.
22094 +         The histograms are disabled by default. To enable them, write a
22095 +         non-zero number to
22097 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
22099 +         Two different algorithms are used, one to determine the latency of
22100 +         processes that exclusively use the highest priority of the system and
22101 +         another one to determine the latency of processes that share the
22102 +         highest system priority with other processes. The former is used to
22103 +         improve hardware and system software, the latter to optimize the
22104 +         priority design of a given system. The histogram data will be
22105 +         located in the debug file system at
22107 +             /sys/kernel/debug/tracing/latency_hist/wakeup
22109 +         and
22111 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
22113 +         If both Scheduling Latency Histogram and Missed Timer Offsets
22114 +         Histogram are selected, additional histogram data will be collected
22115 +         that contain, in addition to the wakeup latency, the timer latency, in
22116 +         case the wakeup was triggered by an expired timer. These histograms
22117 +         are available in the
22119 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
22121 +         directory. They reflect the apparent interrupt and scheduling latency
22122 +         and are best suited to determine the worst-case latency of a given
22123 +         system. To enable these histograms, write a non-zero number to
22125 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
22127 +config MISSED_TIMER_OFFSETS_HIST
22128 +       depends on HIGH_RES_TIMERS
22129 +       select GENERIC_TRACER
22130 +       bool "Missed Timer Offsets Histogram"
22131 +       help
22132 +         Generate a histogram of missed timer offsets in microseconds. The
22133 +         histograms are disabled by default. To enable them, write a non-zero
22134 +         number to
22136 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
22138 +         The histogram data will be located in the debug file system at
22140 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
22142 +         If both Scheduling Latency Histogram and Missed Timer Offsets
22143 +         Histogram are selected, additional histogram data will be collected
22144 +         that contain, in addition to the wakeup latency, the timer latency, in
22145 +         case the wakeup was triggered by an expired timer. These histograms
22146 +         are available in the
22148 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
22150 +         directory. They reflect the apparent interrupt and scheduling latency
22151 +         and are best suited to determine the worst-case latency of a given
22152 +         system. To enable these histograms, write a non-zero number to
22154 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
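As the two help texts above note, the combined timerandwakeup histogram is only meaningful when both of its inputs are active; do_enable() in latency_hist.c (added later in this patch) rejects the enable write with -EINVAL unless the wakeup and missed_timer_offsets histograms are already enabled. A sketch of the required ordering, under the same debugfs-mount assumption as before:

#include <stdio.h>

#define ENABLE_DIR "/sys/kernel/debug/tracing/latency_hist/enable/"

/* Write "1" to one of the enable files; returns 0 on success. */
static int hist_enable(const char *name)
{
	char path[128];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), ENABLE_DIR "%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	ret = (fputs("1\n", f) < 0 || fflush(f) != 0) ? -1 : 0;
	fclose(f);
	return ret;
}

int main(void)
{
	/* Order matters: enabling timerandwakeup fails unless wakeup and
	 * missed_timer_offsets have been enabled first. */
	if (hist_enable("wakeup") ||
	    hist_enable("missed_timer_offsets") ||
	    hist_enable("timerandwakeup")) {
		perror("latency_hist enable");
		return 1;
	}
	return 0;
}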
22156  config ENABLE_DEFAULT_TRACERS
22157         bool "Trace process context switches and events"
22158         depends on !GENERIC_TRACER
22159 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
22160 index e57980845549..83af000b783c 100644
22161 --- a/kernel/trace/Makefile
22162 +++ b/kernel/trace/Makefile
22163 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
22164  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
22165  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
22166  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
22167 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
22168 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
22169 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
22170 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
22171  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
22172  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
22173  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
22174 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
22175 new file mode 100644
22176 index 000000000000..7f6ee70dea41
22177 --- /dev/null
22178 +++ b/kernel/trace/latency_hist.c
22179 @@ -0,0 +1,1178 @@
22181 + * kernel/trace/latency_hist.c
22182 + *
22183 + * Add support for histograms of preemption-off latency,
22184 + * interrupt-off latency and wakeup latency; it depends on
22185 + * Real-Time Preemption Support.
22186 + *
22187 + *  Copyright (C) 2005 MontaVista Software, Inc.
22188 + *  Yi Yang <yyang@ch.mvista.com>
22189 + *
22190 + *  Converted to work with the new latency tracer.
22191 + *  Copyright (C) 2008 Red Hat, Inc.
22192 + *    Steven Rostedt <srostedt@redhat.com>
22193 + *
22194 + */
22195 +#include <linux/module.h>
22196 +#include <linux/debugfs.h>
22197 +#include <linux/seq_file.h>
22198 +#include <linux/percpu.h>
22199 +#include <linux/kallsyms.h>
22200 +#include <linux/uaccess.h>
22201 +#include <linux/sched.h>
22202 +#include <linux/sched/rt.h>
22203 +#include <linux/slab.h>
22204 +#include <linux/atomic.h>
22205 +#include <asm/div64.h>
22207 +#include "trace.h"
22208 +#include <trace/events/sched.h>
22210 +#define NSECS_PER_USECS 1000L
22212 +#define CREATE_TRACE_POINTS
22213 +#include <trace/events/hist.h>
22215 +enum {
22216 +       IRQSOFF_LATENCY = 0,
22217 +       PREEMPTOFF_LATENCY,
22218 +       PREEMPTIRQSOFF_LATENCY,
22219 +       WAKEUP_LATENCY,
22220 +       WAKEUP_LATENCY_SHAREDPRIO,
22221 +       MISSED_TIMER_OFFSETS,
22222 +       TIMERANDWAKEUP_LATENCY,
22223 +       MAX_LATENCY_TYPE,
22226 +#define MAX_ENTRY_NUM 10240
22228 +struct hist_data {
22229 +       atomic_t hist_mode; /* 0 log, 1 don't log */
22230 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
22231 +       long min_lat;
22232 +       long max_lat;
22233 +       unsigned long long below_hist_bound_samples;
22234 +       unsigned long long above_hist_bound_samples;
22235 +       long long accumulate_lat;
22236 +       unsigned long long total_samples;
22237 +       unsigned long long hist_array[MAX_ENTRY_NUM];
22240 +struct enable_data {
22241 +       int latency_type;
22242 +       int enabled;
22245 +static char *latency_hist_dir_root = "latency_hist";
22247 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22248 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
22249 +static char *irqsoff_hist_dir = "irqsoff";
22250 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
22251 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
22252 +#endif
22254 +#ifdef CONFIG_PREEMPT_OFF_HIST
22255 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
22256 +static char *preemptoff_hist_dir = "preemptoff";
22257 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
22258 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
22259 +#endif
22261 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
22262 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
22263 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
22264 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
22265 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
22266 +#endif
22268 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
22269 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
22270 +static struct enable_data preemptirqsoff_enabled_data = {
22271 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
22272 +       .enabled = 0,
22274 +#endif
22276 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22277 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22278 +struct maxlatproc_data {
22279 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
22280 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
22281 +       int pid;
22282 +       int current_pid;
22283 +       int prio;
22284 +       int current_prio;
22285 +       long latency;
22286 +       long timeroffset;
22287 +       cycle_t timestamp;
22289 +#endif
22291 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22292 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
22293 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
22294 +static char *wakeup_latency_hist_dir = "wakeup";
22295 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
22296 +static notrace void probe_wakeup_latency_hist_start(void *v,
22297 +       struct task_struct *p);
22298 +static notrace void probe_wakeup_latency_hist_stop(void *v,
22299 +       bool preempt, struct task_struct *prev, struct task_struct *next);
22300 +static notrace void probe_sched_migrate_task(void *,
22301 +       struct task_struct *task, int cpu);
22302 +static struct enable_data wakeup_latency_enabled_data = {
22303 +       .latency_type = WAKEUP_LATENCY,
22304 +       .enabled = 0,
22306 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
22307 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
22308 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
22309 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
22310 +static unsigned long wakeup_pid;
22311 +#endif
22313 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22314 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
22315 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
22316 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
22317 +       long long offset, struct task_struct *curr, struct task_struct *task);
22318 +static struct enable_data missed_timer_offsets_enabled_data = {
22319 +       .latency_type = MISSED_TIMER_OFFSETS,
22320 +       .enabled = 0,
22322 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
22323 +static unsigned long missed_timer_offsets_pid;
22324 +#endif
22326 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22327 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22328 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
22329 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
22330 +static struct enable_data timerandwakeup_enabled_data = {
22331 +       .latency_type = TIMERANDWAKEUP_LATENCY,
22332 +       .enabled = 0,
22334 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
22335 +#endif
22337 +void notrace latency_hist(int latency_type, int cpu, long latency,
22338 +                         long timeroffset, cycle_t stop,
22339 +                         struct task_struct *p)
22341 +       struct hist_data *my_hist;
22342 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22343 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22344 +       struct maxlatproc_data *mp = NULL;
22345 +#endif
22347 +       if (!cpu_possible(cpu) || latency_type < 0 ||
22348 +           latency_type >= MAX_LATENCY_TYPE)
22349 +               return;
22351 +       switch (latency_type) {
22352 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22353 +       case IRQSOFF_LATENCY:
22354 +               my_hist = &per_cpu(irqsoff_hist, cpu);
22355 +               break;
22356 +#endif
22357 +#ifdef CONFIG_PREEMPT_OFF_HIST
22358 +       case PREEMPTOFF_LATENCY:
22359 +               my_hist = &per_cpu(preemptoff_hist, cpu);
22360 +               break;
22361 +#endif
22362 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
22363 +       case PREEMPTIRQSOFF_LATENCY:
22364 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
22365 +               break;
22366 +#endif
22367 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22368 +       case WAKEUP_LATENCY:
22369 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
22370 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
22371 +               break;
22372 +       case WAKEUP_LATENCY_SHAREDPRIO:
22373 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
22374 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
22375 +               break;
22376 +#endif
22377 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22378 +       case MISSED_TIMER_OFFSETS:
22379 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
22380 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
22381 +               break;
22382 +#endif
22383 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22384 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22385 +       case TIMERANDWAKEUP_LATENCY:
22386 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
22387 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
22388 +               break;
22389 +#endif
22391 +       default:
22392 +               return;
22393 +       }
22395 +       latency += my_hist->offset;
22397 +       if (atomic_read(&my_hist->hist_mode) == 0)
22398 +               return;
22400 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
22401 +               if (latency < 0)
22402 +                       my_hist->below_hist_bound_samples++;
22403 +               else
22404 +                       my_hist->above_hist_bound_samples++;
22405 +       } else
22406 +               my_hist->hist_array[latency]++;
22408 +       if (unlikely(latency > my_hist->max_lat ||
22409 +           my_hist->min_lat == LONG_MAX)) {
22410 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22411 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22412 +               if (latency_type == WAKEUP_LATENCY ||
22413 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
22414 +                   latency_type == MISSED_TIMER_OFFSETS ||
22415 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
22416 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
22417 +                       strncpy(mp->current_comm, current->comm,
22418 +                           sizeof(mp->current_comm));
22419 +                       mp->pid = task_pid_nr(p);
22420 +                       mp->current_pid = task_pid_nr(current);
22421 +                       mp->prio = p->prio;
22422 +                       mp->current_prio = current->prio;
22423 +                       mp->latency = latency;
22424 +                       mp->timeroffset = timeroffset;
22425 +                       mp->timestamp = stop;
22426 +               }
22427 +#endif
22428 +               my_hist->max_lat = latency;
22429 +       }
22430 +       if (unlikely(latency < my_hist->min_lat))
22431 +               my_hist->min_lat = latency;
22432 +       my_hist->total_samples++;
22433 +       my_hist->accumulate_lat += latency;
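The accumulation above is plain array indexing after an offset shift: a sample of L microseconds is stored in hist_array[L + offset]; shifted values below 0 or at/above MAX_ENTRY_NUM only bump the out-of-bounds counters, and min/max/average are tracked on the shifted value (the seq_file output subtracts the offset again). A standalone restatement of that arithmetic with a simplified field set, assuming the caller initialises min_lat/max_lat the way hist_reset() below does:

#include <limits.h>

#define MAX_ENTRY_NUM 10240

struct hist_sim {
	long offset;                   /* MAX_ENTRY_NUM/2 gives a bipolar scale */
	long min_lat, max_lat;         /* start at LONG_MAX / LONG_MIN */
	unsigned long long below, above, total;
	long long accumulated;
	unsigned long long bucket[MAX_ENTRY_NUM];
};

static void hist_add(struct hist_sim *h, long latency_us)
{
	long idx = latency_us + h->offset;

	if (idx < 0)
		h->below++;                /* below_hist_bound_samples */
	else if (idx >= MAX_ENTRY_NUM)
		h->above++;                /* above_hist_bound_samples */
	else
		h->bucket[idx]++;

	if (idx > h->max_lat || h->min_lat == LONG_MAX)
		h->max_lat = idx;
	if (idx < h->min_lat)
		h->min_lat = idx;
	h->total++;
	h->accumulated += idx;             /* average = accumulated/total - offset */
}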
22436 +static void *l_start(struct seq_file *m, loff_t *pos)
22438 +       loff_t *index_ptr = NULL;
22439 +       loff_t index = *pos;
22440 +       struct hist_data *my_hist = m->private;
22442 +       if (index == 0) {
22443 +               char minstr[32], avgstr[32], maxstr[32];
22445 +               atomic_dec(&my_hist->hist_mode);
22447 +               if (likely(my_hist->total_samples)) {
22448 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
22449 +                           my_hist->total_samples);
22450 +                       snprintf(minstr, sizeof(minstr), "%ld",
22451 +                           my_hist->min_lat - my_hist->offset);
22452 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
22453 +                           avg - my_hist->offset);
22454 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
22455 +                           my_hist->max_lat - my_hist->offset);
22456 +               } else {
22457 +                       strcpy(minstr, "<undef>");
22458 +                       strcpy(avgstr, minstr);
22459 +                       strcpy(maxstr, minstr);
22460 +               }
22462 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
22463 +                          "#Average latency: %s microseconds\n"
22464 +                          "#Maximum latency: %s microseconds\n"
22465 +                          "#Total samples: %llu\n"
22466 +                          "#There are %llu samples lower than %ld"
22467 +                          " microseconds.\n"
22468 +                          "#There are %llu samples greater than"
22469 +                          " or equal to %ld microseconds.\n"
22470 +                          "#usecs\t%16s\n",
22471 +                          minstr, avgstr, maxstr,
22472 +                          my_hist->total_samples,
22473 +                          my_hist->below_hist_bound_samples,
22474 +                          -my_hist->offset,
22475 +                          my_hist->above_hist_bound_samples,
22476 +                          MAX_ENTRY_NUM - my_hist->offset,
22477 +                          "samples");
22478 +       }
22479 +       if (index < MAX_ENTRY_NUM) {
22480 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
22481 +               if (index_ptr)
22482 +                       *index_ptr = index;
22483 +       }
22485 +       return index_ptr;
22488 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
22490 +       loff_t *index_ptr = p;
22491 +       struct hist_data *my_hist = m->private;
22493 +       if (++*pos >= MAX_ENTRY_NUM) {
22494 +               atomic_inc(&my_hist->hist_mode);
22495 +               return NULL;
22496 +       }
22497 +       *index_ptr = *pos;
22498 +       return index_ptr;
22501 +static void l_stop(struct seq_file *m, void *p)
22503 +       kfree(p);
22506 +static int l_show(struct seq_file *m, void *p)
22508 +       int index = *(loff_t *) p;
22509 +       struct hist_data *my_hist = m->private;
22511 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
22512 +           my_hist->hist_array[index]);
22513 +       return 0;
22516 +static const struct seq_operations latency_hist_seq_op = {
22517 +       .start = l_start,
22518 +       .next  = l_next,
22519 +       .stop  = l_stop,
22520 +       .show  = l_show
22523 +static int latency_hist_open(struct inode *inode, struct file *file)
22525 +       int ret;
22527 +       ret = seq_open(file, &latency_hist_seq_op);
22528 +       if (!ret) {
22529 +               struct seq_file *seq = file->private_data;
22530 +               seq->private = inode->i_private;
22531 +       }
22532 +       return ret;
22535 +static const struct file_operations latency_hist_fops = {
22536 +       .open = latency_hist_open,
22537 +       .read = seq_read,
22538 +       .llseek = seq_lseek,
22539 +       .release = seq_release,
22542 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22543 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22544 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
22546 +       mp->comm[0] = mp->current_comm[0] = '\0';
22547 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
22548 +           mp->latency = mp->timeroffset = -1;
22549 +       mp->timestamp = 0;
22551 +#endif
22553 +static void hist_reset(struct hist_data *hist)
22555 +       atomic_dec(&hist->hist_mode);
22557 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
22558 +       hist->below_hist_bound_samples = 0ULL;
22559 +       hist->above_hist_bound_samples = 0ULL;
22560 +       hist->min_lat = LONG_MAX;
22561 +       hist->max_lat = LONG_MIN;
22562 +       hist->total_samples = 0ULL;
22563 +       hist->accumulate_lat = 0LL;
22565 +       atomic_inc(&hist->hist_mode);
22568 +static ssize_t
22569 +latency_hist_reset(struct file *file, const char __user *a,
22570 +                  size_t size, loff_t *off)
22572 +       int cpu;
22573 +       struct hist_data *hist = NULL;
22574 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22575 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22576 +       struct maxlatproc_data *mp = NULL;
22577 +#endif
22578 +       off_t latency_type = (off_t) file->private_data;
22580 +       for_each_online_cpu(cpu) {
22582 +               switch (latency_type) {
22583 +#ifdef CONFIG_PREEMPT_OFF_HIST
22584 +               case PREEMPTOFF_LATENCY:
22585 +                       hist = &per_cpu(preemptoff_hist, cpu);
22586 +                       break;
22587 +#endif
22588 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22589 +               case IRQSOFF_LATENCY:
22590 +                       hist = &per_cpu(irqsoff_hist, cpu);
22591 +                       break;
22592 +#endif
22593 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22594 +               case PREEMPTIRQSOFF_LATENCY:
22595 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
22596 +                       break;
22597 +#endif
22598 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22599 +               case WAKEUP_LATENCY:
22600 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
22601 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
22602 +                       break;
22603 +               case WAKEUP_LATENCY_SHAREDPRIO:
22604 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
22605 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
22606 +                       break;
22607 +#endif
22608 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22609 +               case MISSED_TIMER_OFFSETS:
22610 +                       hist = &per_cpu(missed_timer_offsets, cpu);
22611 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
22612 +                       break;
22613 +#endif
22614 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22615 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22616 +               case TIMERANDWAKEUP_LATENCY:
22617 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
22618 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
22619 +                       break;
22620 +#endif
22621 +               }
22623 +               hist_reset(hist);
22624 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22625 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22626 +               if (latency_type == WAKEUP_LATENCY ||
22627 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
22628 +                   latency_type == MISSED_TIMER_OFFSETS ||
22629 +                   latency_type == TIMERANDWAKEUP_LATENCY)
22630 +                       clear_maxlatprocdata(mp);
22631 +#endif
22632 +       }
22634 +       return size;
22637 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22638 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22639 +static ssize_t
22640 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22642 +       char buf[64];
22643 +       int r;
22644 +       unsigned long *this_pid = file->private_data;
22646 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
22647 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22650 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
22651 +                     size_t cnt, loff_t *ppos)
22653 +       char buf[64];
22654 +       unsigned long pid;
22655 +       unsigned long *this_pid = file->private_data;
22657 +       if (cnt >= sizeof(buf))
22658 +               return -EINVAL;
22660 +       if (copy_from_user(&buf, ubuf, cnt))
22661 +               return -EFAULT;
22663 +       buf[cnt] = '\0';
22665 +       if (kstrtoul(buf, 10, &pid))
22666 +               return -EINVAL;
22668 +       *this_pid = pid;
22670 +       return cnt;
22672 +#endif
22674 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22675 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22676 +static ssize_t
22677 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22679 +       int r;
22680 +       struct maxlatproc_data *mp = file->private_data;
22681 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
22682 +       unsigned long long t;
22683 +       unsigned long usecs, secs;
22684 +       char *buf;
22686 +       if (mp->pid == -1 || mp->current_pid == -1) {
22687 +               buf = "(none)\n";
22688 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
22689 +                   strlen(buf));
22690 +       }
22692 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
22693 +       if (buf == NULL)
22694 +               return -ENOMEM;
22696 +       t = ns2usecs(mp->timestamp);
22697 +       usecs = do_div(t, USEC_PER_SEC);
22698 +       secs = (unsigned long) t;
22699 +       r = snprintf(buf, strmaxlen,
22700 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
22701 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
22702 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
22703 +           secs, usecs);
22704 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22705 +       kfree(buf);
22706 +       return r;
22708 +#endif
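show_maxlatproc() above prints one record per read in the fixed format "PID PRIO LATENCY (TIMEROFFSET) COMM <- CURRENT_PID CURRENT_PRIO CURRENT_COMM SECS.USECS", with both priorities already converted to RT priorities (MAX_RT_PRIO-1 - prio), or the literal string "(none)" while no maximum has been recorded yet. A small decoder for that line, assuming task names without embedded whitespace (a simplification; comm strings may in principle contain spaces):

#include <stdio.h>

struct maxlat_rec {
	int pid, prio;                 /* woken task and its RT priority */
	long latency, timeroffset;     /* microseconds */
	char comm[32];
	int current_pid, current_prio; /* task that was running instead */
	char current_comm[32];
	unsigned long secs, usecs;     /* timestamp of the maximum */
};

/* Parse one max_latency-CPU%d line; returns 0 on success,
 * -1 for malformed input or the "(none)" placeholder. */
static int parse_maxlat(const char *line, struct maxlat_rec *r)
{
	int n = sscanf(line, "%d %d %ld (%ld) %31s <- %d %d %31s %lu.%lu",
		       &r->pid, &r->prio, &r->latency, &r->timeroffset,
		       r->comm, &r->current_pid, &r->current_prio,
		       r->current_comm, &r->secs, &r->usecs);
	return n == 10 ? 0 : -1;
}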
22710 +static ssize_t
22711 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22713 +       char buf[64];
22714 +       struct enable_data *ed = file->private_data;
22715 +       int r;
22717 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
22718 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22721 +static ssize_t
22722 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
22724 +       char buf[64];
22725 +       long enable;
22726 +       struct enable_data *ed = file->private_data;
22728 +       if (cnt >= sizeof(buf))
22729 +               return -EINVAL;
22731 +       if (copy_from_user(&buf, ubuf, cnt))
22732 +               return -EFAULT;
22734 +       buf[cnt] = 0;
22736 +       if (kstrtoul(buf, 10, &enable))
22737 +               return -EINVAL;
22739 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
22740 +               return cnt;
22742 +       if (enable) {
22743 +               int ret;
22745 +               switch (ed->latency_type) {
22746 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22747 +               case PREEMPTIRQSOFF_LATENCY:
22748 +                       ret = register_trace_preemptirqsoff_hist(
22749 +                           probe_preemptirqsoff_hist, NULL);
22750 +                       if (ret) {
22751 +                               pr_info("wakeup trace: Couldn't assign "
22752 +                                   "probe_preemptirqsoff_hist "
22753 +                                   "to trace_preemptirqsoff_hist\n");
22754 +                               return ret;
22755 +                       }
22756 +                       break;
22757 +#endif
22758 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22759 +               case WAKEUP_LATENCY:
22760 +                       ret = register_trace_sched_wakeup(
22761 +                           probe_wakeup_latency_hist_start, NULL);
22762 +                       if (ret) {
22763 +                               pr_info("wakeup trace: Couldn't assign "
22764 +                                   "probe_wakeup_latency_hist_start "
22765 +                                   "to trace_sched_wakeup\n");
22766 +                               return ret;
22767 +                       }
22768 +                       ret = register_trace_sched_wakeup_new(
22769 +                           probe_wakeup_latency_hist_start, NULL);
22770 +                       if (ret) {
22771 +                               pr_info("wakeup trace: Couldn't assign "
22772 +                                   "probe_wakeup_latency_hist_start "
22773 +                                   "to trace_sched_wakeup_new\n");
22774 +                               unregister_trace_sched_wakeup(
22775 +                                   probe_wakeup_latency_hist_start, NULL);
22776 +                               return ret;
22777 +                       }
22778 +                       ret = register_trace_sched_switch(
22779 +                           probe_wakeup_latency_hist_stop, NULL);
22780 +                       if (ret) {
22781 +                               pr_info("wakeup trace: Couldn't assign "
22782 +                                   "probe_wakeup_latency_hist_stop "
22783 +                                   "to trace_sched_switch\n");
22784 +                               unregister_trace_sched_wakeup(
22785 +                                   probe_wakeup_latency_hist_start, NULL);
22786 +                               unregister_trace_sched_wakeup_new(
22787 +                                   probe_wakeup_latency_hist_start, NULL);
22788 +                               return ret;
22789 +                       }
22790 +                       ret = register_trace_sched_migrate_task(
22791 +                           probe_sched_migrate_task, NULL);
22792 +                       if (ret) {
22793 +                               pr_info("wakeup trace: Couldn't assign "
22794 +                                   "probe_sched_migrate_task "
22795 +                                   "to trace_sched_migrate_task\n");
22796 +                               unregister_trace_sched_wakeup(
22797 +                                   probe_wakeup_latency_hist_start, NULL);
22798 +                               unregister_trace_sched_wakeup_new(
22799 +                                   probe_wakeup_latency_hist_start, NULL);
22800 +                               unregister_trace_sched_switch(
22801 +                                   probe_wakeup_latency_hist_stop, NULL);
22802 +                               return ret;
22803 +                       }
22804 +                       break;
22805 +#endif
22806 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22807 +               case MISSED_TIMER_OFFSETS:
22808 +                       ret = register_trace_hrtimer_interrupt(
22809 +                           probe_hrtimer_interrupt, NULL);
22810 +                       if (ret) {
22811 +                               pr_info("wakeup trace: Couldn't assign "
22812 +                                   "probe_hrtimer_interrupt "
22813 +                                   "to trace_hrtimer_interrupt\n");
22814 +                               return ret;
22815 +                       }
22816 +                       break;
22817 +#endif
22818 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22819 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22820 +               case TIMERANDWAKEUP_LATENCY:
22821 +                       if (!wakeup_latency_enabled_data.enabled ||
22822 +                           !missed_timer_offsets_enabled_data.enabled)
22823 +                               return -EINVAL;
22824 +                       break;
22825 +#endif
22826 +               default:
22827 +                       break;
22828 +               }
22829 +       } else {
22830 +               switch (ed->latency_type) {
22831 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22832 +               case PREEMPTIRQSOFF_LATENCY:
22833 +                       {
22834 +                               int cpu;
22836 +                               unregister_trace_preemptirqsoff_hist(
22837 +                                   probe_preemptirqsoff_hist, NULL);
22838 +                               for_each_online_cpu(cpu) {
22839 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22840 +                                       per_cpu(hist_irqsoff_counting,
22841 +                                           cpu) = 0;
22842 +#endif
22843 +#ifdef CONFIG_PREEMPT_OFF_HIST
22844 +                                       per_cpu(hist_preemptoff_counting,
22845 +                                           cpu) = 0;
22846 +#endif
22847 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22848 +                                       per_cpu(hist_preemptirqsoff_counting,
22849 +                                           cpu) = 0;
22850 +#endif
22851 +                               }
22852 +                       }
22853 +                       break;
22854 +#endif
22855 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22856 +               case WAKEUP_LATENCY:
22857 +                       {
22858 +                               int cpu;
22860 +                               unregister_trace_sched_wakeup(
22861 +                                   probe_wakeup_latency_hist_start, NULL);
22862 +                               unregister_trace_sched_wakeup_new(
22863 +                                   probe_wakeup_latency_hist_start, NULL);
22864 +                               unregister_trace_sched_switch(
22865 +                                   probe_wakeup_latency_hist_stop, NULL);
22866 +                               unregister_trace_sched_migrate_task(
22867 +                                   probe_sched_migrate_task, NULL);
22869 +                               for_each_online_cpu(cpu) {
22870 +                                       per_cpu(wakeup_task, cpu) = NULL;
22871 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
22872 +                               }
22873 +                       }
22874 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22875 +                       timerandwakeup_enabled_data.enabled = 0;
22876 +#endif
22877 +                       break;
22878 +#endif
22879 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22880 +               case MISSED_TIMER_OFFSETS:
22881 +                       unregister_trace_hrtimer_interrupt(
22882 +                           probe_hrtimer_interrupt, NULL);
22883 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22884 +                       timerandwakeup_enabled_data.enabled = 0;
22885 +#endif
22886 +                       break;
22887 +#endif
22888 +               default:
22889 +                       break;
22890 +               }
22891 +       }
22892 +       ed->enabled = enable;
22893 +       return cnt;
22896 +static const struct file_operations latency_hist_reset_fops = {
22897 +       .open = tracing_open_generic,
22898 +       .write = latency_hist_reset,
22901 +static const struct file_operations enable_fops = {
22902 +       .open = tracing_open_generic,
22903 +       .read = show_enable,
22904 +       .write = do_enable,
22907 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22908 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22909 +static const struct file_operations pid_fops = {
22910 +       .open = tracing_open_generic,
22911 +       .read = show_pid,
22912 +       .write = do_pid,
22915 +static const struct file_operations maxlatproc_fops = {
22916 +       .open = tracing_open_generic,
22917 +       .read = show_maxlatproc,
22919 +#endif
22921 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22922 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
22923 +       int starthist)
22925 +       int cpu = raw_smp_processor_id();
22926 +       int time_set = 0;
22928 +       if (starthist) {
22929 +               cycle_t uninitialized_var(start);
22931 +               if (!preempt_count() && !irqs_disabled())
22932 +                       return;
22934 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22935 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
22936 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
22937 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
22938 +                       start = ftrace_now(cpu);
22939 +                       time_set++;
22940 +                       per_cpu(hist_irqsoff_start, cpu) = start;
22941 +               }
22942 +#endif
22944 +#ifdef CONFIG_PREEMPT_OFF_HIST
22945 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
22946 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
22947 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
22948 +                       if (!(time_set++))
22949 +                               start = ftrace_now(cpu);
22950 +                       per_cpu(hist_preemptoff_start, cpu) = start;
22951 +               }
22952 +#endif
22954 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22955 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
22956 +                   per_cpu(hist_preemptoff_counting, cpu) &&
22957 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
22958 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
22959 +                       if (!time_set)
22960 +                               start = ftrace_now(cpu);
22961 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
22962 +               }
22963 +#endif
22964 +       } else {
22965 +               cycle_t uninitialized_var(stop);
22967 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22968 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
22969 +                   per_cpu(hist_irqsoff_counting, cpu)) {
22970 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
22972 +                       stop = ftrace_now(cpu);
22973 +                       time_set++;
22974 +                       if (start) {
22975 +                               long latency = ((long) (stop - start)) /
22976 +                                   NSECS_PER_USECS;
22978 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
22979 +                                   stop, NULL);
22980 +                       }
22981 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
22982 +               }
22983 +#endif
22985 +#ifdef CONFIG_PREEMPT_OFF_HIST
22986 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
22987 +                   per_cpu(hist_preemptoff_counting, cpu)) {
22988 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
22990 +                       if (!(time_set++))
22991 +                               stop = ftrace_now(cpu);
22992 +                       if (start) {
22993 +                               long latency = ((long) (stop - start)) /
22994 +                                   NSECS_PER_USECS;
22996 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
22997 +                                   0, stop, NULL);
22998 +                       }
22999 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
23000 +               }
23001 +#endif
23003 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
23004 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
23005 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
23006 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
23007 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
23009 +                       if (!time_set)
23010 +                               stop = ftrace_now(cpu);
23011 +                       if (start) {
23012 +                               long latency = ((long) (stop - start)) /
23013 +                                   NSECS_PER_USECS;
23015 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
23016 +                                   latency, 0, stop, NULL);
23017 +                       }
23018 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
23019 +               }
23020 +#endif
23021 +       }
23023 +#endif
23025 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23026 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
23027 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
23028 +       int cpu)
23030 +       int old_cpu = task_cpu(task);
23032 +       if (cpu != old_cpu) {
23033 +               unsigned long flags;
23034 +               struct task_struct *cpu_wakeup_task;
23036 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
23038 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
23039 +               if (task == cpu_wakeup_task) {
23040 +                       put_task_struct(cpu_wakeup_task);
23041 +                       per_cpu(wakeup_task, old_cpu) = NULL;
23042 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
23043 +                       get_task_struct(cpu_wakeup_task);
23044 +               }
23046 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23047 +       }
23050 +static notrace void probe_wakeup_latency_hist_start(void *v,
23051 +       struct task_struct *p)
23053 +       unsigned long flags;
23054 +       struct task_struct *curr = current;
23055 +       int cpu = task_cpu(p);
23056 +       struct task_struct *cpu_wakeup_task;
23058 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
23060 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
23062 +       if (wakeup_pid) {
23063 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
23064 +                   p->prio == curr->prio)
23065 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23066 +               if (likely(wakeup_pid != task_pid_nr(p)))
23067 +                       goto out;
23068 +       } else {
23069 +               if (likely(!rt_task(p)) ||
23070 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
23071 +                   p->prio > curr->prio)
23072 +                       goto out;
23073 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
23074 +                   p->prio == curr->prio)
23075 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23076 +       }
23078 +       if (cpu_wakeup_task)
23079 +               put_task_struct(cpu_wakeup_task);
23080 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
23081 +       get_task_struct(cpu_wakeup_task);
23082 +       cpu_wakeup_task->preempt_timestamp_hist =
23083 +               ftrace_now(raw_smp_processor_id());
23084 +out:
23085 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23088 +static notrace void probe_wakeup_latency_hist_stop(void *v,
23089 +       bool preempt, struct task_struct *prev, struct task_struct *next)
23091 +       unsigned long flags;
23092 +       int cpu = task_cpu(next);
23093 +       long latency;
23094 +       cycle_t stop;
23095 +       struct task_struct *cpu_wakeup_task;
23097 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
23099 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
23101 +       if (cpu_wakeup_task == NULL)
23102 +               goto out;
23104 +       /* Already running? */
23105 +       if (unlikely(current == cpu_wakeup_task))
23106 +               goto out_reset;
23108 +       if (next != cpu_wakeup_task) {
23109 +               if (next->prio < cpu_wakeup_task->prio)
23110 +                       goto out_reset;
23112 +               if (next->prio == cpu_wakeup_task->prio)
23113 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23115 +               goto out;
23116 +       }
23118 +       if (current->prio == cpu_wakeup_task->prio)
23119 +               per_cpu(wakeup_sharedprio, cpu) = 1;
23121 +       /*
23122 +        * The task we are waiting for is about to be switched to.
23123 +        * Calculate latency and store it in histogram.
23124 +        */
23125 +       stop = ftrace_now(raw_smp_processor_id());
23127 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
23128 +           NSECS_PER_USECS;
23130 +       if (per_cpu(wakeup_sharedprio, cpu)) {
23131 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
23132 +                   next);
23133 +               per_cpu(wakeup_sharedprio, cpu) = 0;
23134 +       } else {
23135 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
23136 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23137 +               if (timerandwakeup_enabled_data.enabled) {
23138 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
23139 +                           next->timer_offset + latency, next->timer_offset,
23140 +                           stop, next);
23141 +               }
23142 +#endif
23143 +       }
23145 +out_reset:
23146 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23147 +       next->timer_offset = 0;
23148 +#endif
23149 +       put_task_struct(cpu_wakeup_task);
23150 +       per_cpu(wakeup_task, cpu) = NULL;
23151 +out:
23152 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23154 +#endif
23156 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23157 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
23158 +       long long latency_ns, struct task_struct *curr,
23159 +       struct task_struct *task)
23161 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
23162 +           (task->prio < curr->prio ||
23163 +           (task->prio == curr->prio &&
23164 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
23165 +               long latency;
23166 +               cycle_t now;
23168 +               if (missed_timer_offsets_pid) {
23169 +                       if (likely(missed_timer_offsets_pid !=
23170 +                           task_pid_nr(task)))
23171 +                               return;
23172 +               }
23174 +               now = ftrace_now(cpu);
23175 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
23176 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
23177 +                   task);
23178 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23179 +               task->timer_offset = latency;
23180 +#endif
23181 +       }
23183 +#endif
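probe_hrtimer_interrupt() above only records an entry when latency_ns is zero or negative, i.e. the hrtimer fired at or after its programmed expiry, and it converts the nanosecond offset to a positive microsecond value with div_s64(-latency_ns, NSECS_PER_USECS). The same conversion restated in user space, with one concrete value as a sanity check:

#include <assert.h>

#define NSECS_PER_USECS 1000L

/* Mirrors div_s64(-latency_ns, NSECS_PER_USECS) for latency_ns <= 0. */
static long missed_offset_usecs(long long latency_ns)
{
	return (long)(-latency_ns / NSECS_PER_USECS);
}

int main(void)
{
	/* An hrtimer that expired 250 microseconds late is reported with
	 * latency_ns == -250000 and logged as a 250 us missed offset. */
	assert(missed_offset_usecs(-250000) == 250);
	return 0;
}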
23185 +static __init int latency_hist_init(void)
23187 +       struct dentry *latency_hist_root = NULL;
23188 +       struct dentry *dentry;
23189 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23190 +       struct dentry *dentry_sharedprio;
23191 +#endif
23192 +       struct dentry *entry;
23193 +       struct dentry *enable_root;
23194 +       int i = 0;
23195 +       struct hist_data *my_hist;
23196 +       char name[64];
23197 +       char *cpufmt = "CPU%d";
23198 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
23199 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
23200 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
23201 +       struct maxlatproc_data *mp = NULL;
23202 +#endif
23204 +       dentry = tracing_init_dentry();
23205 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
23206 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
23208 +#ifdef CONFIG_INTERRUPT_OFF_HIST
23209 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
23210 +       for_each_possible_cpu(i) {
23211 +               sprintf(name, cpufmt, i);
23212 +               entry = debugfs_create_file(name, 0444, dentry,
23213 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
23214 +               my_hist = &per_cpu(irqsoff_hist, i);
23215 +               atomic_set(&my_hist->hist_mode, 1);
23216 +               my_hist->min_lat = LONG_MAX;
23217 +       }
23218 +       entry = debugfs_create_file("reset", 0644, dentry,
23219 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
23220 +#endif
23222 +#ifdef CONFIG_PREEMPT_OFF_HIST
23223 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
23224 +           latency_hist_root);
23225 +       for_each_possible_cpu(i) {
23226 +               sprintf(name, cpufmt, i);
23227 +               entry = debugfs_create_file(name, 0444, dentry,
23228 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
23229 +               my_hist = &per_cpu(preemptoff_hist, i);
23230 +               atomic_set(&my_hist->hist_mode, 1);
23231 +               my_hist->min_lat = LONG_MAX;
23232 +       }
23233 +       entry = debugfs_create_file("reset", 0644, dentry,
23234 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
23235 +#endif
23237 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
23238 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
23239 +           latency_hist_root);
23240 +       for_each_possible_cpu(i) {
23241 +               sprintf(name, cpufmt, i);
23242 +               entry = debugfs_create_file(name, 0444, dentry,
23243 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
23244 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
23245 +               atomic_set(&my_hist->hist_mode, 1);
23246 +               my_hist->min_lat = LONG_MAX;
23247 +       }
23248 +       entry = debugfs_create_file("reset", 0644, dentry,
23249 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
23250 +#endif
23252 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
23253 +       entry = debugfs_create_file("preemptirqsoff", 0644,
23254 +           enable_root, (void *)&preemptirqsoff_enabled_data,
23255 +           &enable_fops);
23256 +#endif
23258 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23259 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
23260 +           latency_hist_root);
23261 +       dentry_sharedprio = debugfs_create_dir(
23262 +           wakeup_latency_hist_dir_sharedprio, dentry);
23263 +       for_each_possible_cpu(i) {
23264 +               sprintf(name, cpufmt, i);
23266 +               entry = debugfs_create_file(name, 0444, dentry,
23267 +                   &per_cpu(wakeup_latency_hist, i),
23268 +                   &latency_hist_fops);
23269 +               my_hist = &per_cpu(wakeup_latency_hist, i);
23270 +               atomic_set(&my_hist->hist_mode, 1);
23271 +               my_hist->min_lat = LONG_MAX;
23273 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
23274 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
23275 +                   &latency_hist_fops);
23276 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
23277 +               atomic_set(&my_hist->hist_mode, 1);
23278 +               my_hist->min_lat = LONG_MAX;
23280 +               sprintf(name, cpufmt_maxlatproc, i);
23282 +               mp = &per_cpu(wakeup_maxlatproc, i);
23283 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23284 +                   &maxlatproc_fops);
23285 +               clear_maxlatprocdata(mp);
23287 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
23288 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
23289 +                   &maxlatproc_fops);
23290 +               clear_maxlatprocdata(mp);
23291 +       }
23292 +       entry = debugfs_create_file("pid", 0644, dentry,
23293 +           (void *)&wakeup_pid, &pid_fops);
23294 +       entry = debugfs_create_file("reset", 0644, dentry,
23295 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
23296 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
23297 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
23298 +       entry = debugfs_create_file("wakeup", 0644,
23299 +           enable_root, (void *)&wakeup_latency_enabled_data,
23300 +           &enable_fops);
23301 +#endif
23303 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23304 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
23305 +           latency_hist_root);
23306 +       for_each_possible_cpu(i) {
23307 +               sprintf(name, cpufmt, i);
23308 +               entry = debugfs_create_file(name, 0444, dentry,
23309 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
23310 +               my_hist = &per_cpu(missed_timer_offsets, i);
23311 +               atomic_set(&my_hist->hist_mode, 1);
23312 +               my_hist->min_lat = LONG_MAX;
23314 +               sprintf(name, cpufmt_maxlatproc, i);
23315 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
23316 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23317 +                   &maxlatproc_fops);
23318 +               clear_maxlatprocdata(mp);
23319 +       }
23320 +       entry = debugfs_create_file("pid", 0644, dentry,
23321 +           (void *)&missed_timer_offsets_pid, &pid_fops);
23322 +       entry = debugfs_create_file("reset", 0644, dentry,
23323 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
23324 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
23325 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
23326 +           &enable_fops);
23327 +#endif
23329 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
23330 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
23331 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
23332 +           latency_hist_root);
23333 +       for_each_possible_cpu(i) {
23334 +               sprintf(name, cpufmt, i);
23335 +               entry = debugfs_create_file(name, 0444, dentry,
23336 +                   &per_cpu(timerandwakeup_latency_hist, i),
23337 +                   &latency_hist_fops);
23338 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
23339 +               atomic_set(&my_hist->hist_mode, 1);
23340 +               my_hist->min_lat = LONG_MAX;
23342 +               sprintf(name, cpufmt_maxlatproc, i);
23343 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
23344 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23345 +                   &maxlatproc_fops);
23346 +               clear_maxlatprocdata(mp);
23347 +       }
23348 +       entry = debugfs_create_file("reset", 0644, dentry,
23349 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
23350 +       entry = debugfs_create_file("timerandwakeup", 0644,
23351 +           enable_root, (void *)&timerandwakeup_enabled_data,
23352 +           &enable_fops);
23353 +#endif
23354 +       return 0;
23357 +device_initcall(latency_hist_init);
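
The initcall above only wires up the debugfs tree; the histograms themselves are enabled, reset and read entirely from user space. A minimal consumer is sketched below. The file names ("CPU%d", "reset", "pid", and the "wakeup" switch under enable/) come straight from latency_hist_init() above; the directory names and the debugfs mount point are assumptions (the tree is created under the tracing directory, conventionally /sys/kernel/debug/tracing, in a directory assumed here to be called latency_hist with a wakeup subdirectory) and may need adjusting for a given kernel.

/*
 * Hypothetical user-space consumer for the wakeup latency histogram
 * created by latency_hist_init().  Paths are assumptions -- see the
 * note above.
 */
#include <stdio.h>
#include <stdlib.h>

#define HIST_DIR "/sys/kernel/debug/tracing/latency_hist"

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        fclose(f);
        return 0;
}

int main(void)
{
        char line[256];
        FILE *f;

        /* switch the wakeup latency histogram on */
        if (write_str(HIST_DIR "/enable/wakeup", "1"))
                perror("enable/wakeup");

        /* clear any old samples for this histogram type */
        write_str(HIST_DIR "/wakeup/reset", "1");

        /* ... run the workload of interest, then dump CPU0 ... */
        f = fopen(HIST_DIR "/wakeup/CPU0", "r");
        if (!f) {
                perror("wakeup/CPU0");
                return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* one histogram bucket per line */
        fclose(f);
        return EXIT_SUCCESS;
}
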
23358 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
23359 index 15b02645ce8b..00d9ebcf42e2 100644
23360 --- a/kernel/trace/trace.c
23361 +++ b/kernel/trace/trace.c
23362 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
23363         struct task_struct *tsk = current;
23365         entry->preempt_count            = pc & 0xff;
23366 +       entry->preempt_lazy_count       = preempt_lazy_count();
23367         entry->pid                      = (tsk) ? tsk->pid : 0;
23368         entry->flags =
23369  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
23370 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
23371                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
23372                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
23373                 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
23374 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
23375 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
23376 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
23377                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
23379 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
23381  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
23383 @@ -2898,14 +2902,17 @@ get_total_entries(struct trace_buffer *buf,
23385  static void print_lat_help_header(struct seq_file *m)
23387 -       seq_puts(m, "#                  _------=> CPU#            \n"
23388 -                   "#                 / _-----=> irqs-off        \n"
23389 -                   "#                | / _----=> need-resched    \n"
23390 -                   "#                || / _---=> hardirq/softirq \n"
23391 -                   "#                ||| / _--=> preempt-depth   \n"
23392 -                   "#                |||| /     delay            \n"
23393 -                   "#  cmd     pid   ||||| time  |   caller      \n"
23394 -                   "#     \\   /      |||||  \\    |   /         \n");
23395 +       seq_puts(m, "#                  _--------=> CPU#              \n"
23396 +                   "#                 / _-------=> irqs-off          \n"
23397 +                   "#                | / _------=> need-resched      \n"
23398 +                   "#                || / _-----=> need-resched_lazy \n"
23399 +                   "#                ||| / _----=> hardirq/softirq   \n"
23400 +                   "#                |||| / _---=> preempt-depth     \n"
23401 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
23402 +                   "#                |||||| / _-=> migrate-disable   \n"
23403 +                   "#                ||||||| /     delay             \n"
23404 +                   "# cmd     pid    |||||||| time   |  caller       \n"
23405 +                   "#     \\   /      ||||||||   \\    |  /            \n");
23408  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
23409 @@ -2931,11 +2938,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
23410         print_event_info(buf, m);
23411         seq_puts(m, "#                              _-----=> irqs-off\n"
23412                     "#                             / _----=> need-resched\n"
23413 -                   "#                            | / _---=> hardirq/softirq\n"
23414 -                   "#                            || / _--=> preempt-depth\n"
23415 -                   "#                            ||| /     delay\n"
23416 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
23417 -                   "#              | |       |   ||||       |         |\n");
23418 +                   "#                            |/  _-----=> need-resched_lazy\n"
23419 +                   "#                            || / _---=> hardirq/softirq\n"
23420 +                   "#                            ||| / _--=> preempt-depth\n"
23421 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
23422 +                   "#                            ||||| / _-=> migrate-disable   \n"
23423 +                   "#                            |||||| /    delay\n"
23424 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
23425 +                   "#              | |       |   |||||||      |         |\n");
23428  void
23429 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
23430 index b0d8576c27ae..702b9376b278 100644
23431 --- a/kernel/trace/trace.h
23432 +++ b/kernel/trace/trace.h
23433 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
23434   *  NEED_RESCHED       - reschedule is requested
23435   *  HARDIRQ            - inside an interrupt handler
23436   *  SOFTIRQ            - inside a softirq handler
23437 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
23438   */
23439  enum trace_flag_type {
23440         TRACE_FLAG_IRQS_OFF             = 0x01,
23441 @@ -133,6 +134,7 @@ enum trace_flag_type {
23442         TRACE_FLAG_SOFTIRQ              = 0x10,
23443         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
23444         TRACE_FLAG_NMI                  = 0x40,
23445 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
23446  };
23448  #define TRACE_BUF_SIZE         1024
23449 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
23450 index 03c0a48c3ac4..0b85d516b491 100644
23451 --- a/kernel/trace/trace_events.c
23452 +++ b/kernel/trace/trace_events.c
23453 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
23454         __common_field(unsigned char, flags);
23455         __common_field(unsigned char, preempt_count);
23456         __common_field(int, pid);
23457 +       __common_field(unsigned short, migrate_disable);
23458 +       __common_field(unsigned short, padding);
23460         return ret;
23462 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
23463 index 03cdff84d026..940bd10b4406 100644
23464 --- a/kernel/trace/trace_irqsoff.c
23465 +++ b/kernel/trace/trace_irqsoff.c
23466 @@ -13,6 +13,7 @@
23467  #include <linux/uaccess.h>
23468  #include <linux/module.h>
23469  #include <linux/ftrace.h>
23470 +#include <trace/events/hist.h>
23472  #include "trace.h"
23474 @@ -424,11 +425,13 @@ void start_critical_timings(void)
23476         if (preempt_trace() || irq_trace())
23477                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23478 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
23480  EXPORT_SYMBOL_GPL(start_critical_timings);
23482  void stop_critical_timings(void)
23484 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
23485         if (preempt_trace() || irq_trace())
23486                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23488 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
23489  #ifdef CONFIG_PROVE_LOCKING
23490  void time_hardirqs_on(unsigned long a0, unsigned long a1)
23492 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
23493         if (!preempt_trace() && irq_trace())
23494                 stop_critical_timing(a0, a1);
23496 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
23498         if (!preempt_trace() && irq_trace())
23499                 start_critical_timing(a0, a1);
23500 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
23503  #else /* !CONFIG_PROVE_LOCKING */
23504 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
23505   */
23506  void trace_hardirqs_on(void)
23508 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
23509         if (!preempt_trace() && irq_trace())
23510                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23512 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
23514         if (!preempt_trace() && irq_trace())
23515                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23516 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
23518  EXPORT_SYMBOL(trace_hardirqs_off);
23520  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
23522 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
23523         if (!preempt_trace() && irq_trace())
23524                 stop_critical_timing(CALLER_ADDR0, caller_addr);
23526 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
23528         if (!preempt_trace() && irq_trace())
23529                 start_critical_timing(CALLER_ADDR0, caller_addr);
23530 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
23532  EXPORT_SYMBOL(trace_hardirqs_off_caller);
23534 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
23535  #ifdef CONFIG_PREEMPT_TRACER
23536  void trace_preempt_on(unsigned long a0, unsigned long a1)
23538 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
23539         if (preempt_trace() && !irq_trace())
23540                 stop_critical_timing(a0, a1);
23543  void trace_preempt_off(unsigned long a0, unsigned long a1)
23545 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
23546         if (preempt_trace() && !irq_trace())
23547                 start_critical_timing(a0, a1);
23549 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
23550 index 3fc20422c166..65a6dde71a7d 100644
23551 --- a/kernel/trace/trace_output.c
23552 +++ b/kernel/trace/trace_output.c
23553 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23555         char hardsoft_irq;
23556         char need_resched;
23557 +       char need_resched_lazy;
23558         char irqs_off;
23559         int hardirq;
23560         int softirq;
23561 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23562                 break;
23563         }
23565 +       need_resched_lazy =
23566 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
23568         hardsoft_irq =
23569                 (nmi && hardirq)     ? 'Z' :
23570                 nmi                  ? 'z' :
23571 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23572                 softirq              ? 's' :
23573                                        '.' ;
23575 -       trace_seq_printf(s, "%c%c%c",
23576 -                        irqs_off, need_resched, hardsoft_irq);
23577 +       trace_seq_printf(s, "%c%c%c%c",
23578 +                        irqs_off, need_resched, need_resched_lazy,
23579 +                        hardsoft_irq);
23581         if (entry->preempt_count)
23582                 trace_seq_printf(s, "%x", entry->preempt_count);
23583         else
23584                 trace_seq_putc(s, '.');
23586 +       if (entry->preempt_lazy_count)
23587 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
23588 +       else
23589 +               trace_seq_putc(s, '.');
23591 +       if (entry->migrate_disable)
23592 +               trace_seq_printf(s, "%x", entry->migrate_disable);
23593 +       else
23594 +               trace_seq_putc(s, '.');
23596         return !trace_seq_has_overflowed(s);
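
Taken together, the trace.c and trace_output.c hunks above add three latency-format columns: a need-resched-lazy character (printed as 'L' when TRACE_FLAG_NEED_RESCHED_LAZY is set) right after the need-resched column, plus the preempt-lazy depth and the migrate-disable count printed as hex digits (or '.') after the existing preempt depth. The stand-alone sketch below reproduces just those new columns; the struct is a reduced stand-in for struct trace_entry, only the flag value introduced in trace.h above (0x80) is used, and in the real output the hardirq/softirq character still sits between the 'L' and the depth columns.

/*
 * Stand-alone illustration of the extra latency-format columns added
 * by this patch.  TRACE_FLAG_NEED_RESCHED_LAZY matches the enum value
 * added above; struct fake_entry is a stand-in, not the kernel struct.
 */
#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED_LAZY    0x80

struct fake_entry {
        unsigned char flags;
        unsigned char preempt_count;
        unsigned char preempt_lazy_count;
        unsigned short migrate_disable;
};

static void print_new_columns(const struct fake_entry *e)
{
        /* 'L' column: a lazy reschedule is pending */
        putchar((e->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.');

        /* preempt depth, preempt-lazy depth, migrate-disable count */
        if (e->preempt_count)
                printf("%x", e->preempt_count);
        else
                putchar('.');
        if (e->preempt_lazy_count)
                printf("%x", e->preempt_lazy_count);
        else
                putchar('.');
        if (e->migrate_disable)
                printf("%x", e->migrate_disable);
        else
                putchar('.');
        putchar('\n');
}

int main(void)
{
        struct fake_entry e = {
                .flags = TRACE_FLAG_NEED_RESCHED_LAZY,
                .preempt_count = 1,
                .preempt_lazy_count = 0,
                .migrate_disable = 2,
        };

        print_new_columns(&e);  /* prints "L1.2" */
        return 0;
}
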
23599 diff --git a/kernel/user.c b/kernel/user.c
23600 index b069ccbfb0b0..1a2e88e98b5e 100644
23601 --- a/kernel/user.c
23602 +++ b/kernel/user.c
23603 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
23604         if (!up)
23605                 return;
23607 -       local_irq_save(flags);
23608 +       local_irq_save_nort(flags);
23609         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
23610                 free_user(up, flags);
23611         else
23612 -               local_irq_restore(flags);
23613 +               local_irq_restore_nort(flags);
23616  struct user_struct *alloc_uid(kuid_t uid)
23617 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
23618 index 63177be0159e..59fe007ad496 100644
23619 --- a/kernel/watchdog.c
23620 +++ b/kernel/watchdog.c
23621 @@ -381,6 +381,7 @@ static void watchdog_enable(unsigned int cpu)
23622         /* kick off the timer for the hardlockup detector */
23623         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23624         hrtimer->function = watchdog_timer_fn;
23625 +       hrtimer->irqsafe = 1;
23627         /* Enable the perf event */
23628         watchdog_nmi_enable(cpu);
23629 diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
23630 index 12b8dd640786..4c90d2ee7433 100644
23631 --- a/kernel/watchdog_hld.c
23632 +++ b/kernel/watchdog_hld.c
23633 @@ -19,6 +19,7 @@
23634  static DEFINE_PER_CPU(bool, hard_watchdog_warn);
23635  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23636  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
23637 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
23639  /* boot commands */
23640  /*
23641 @@ -104,6 +105,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
23642                 /* only print hardlockups once */
23643                 if (__this_cpu_read(hard_watchdog_warn) == true)
23644                         return;
23645 +               /*
23646 +                * If early-printk is enabled then make sure we do not
23647 +                * lock up in printk() and kill console logging:
23648 +                */
23649 +               printk_kill();
23651 +               raw_spin_lock(&watchdog_output_lock);
23653                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
23654                 print_modules();
23655 @@ -121,6 +129,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
23656                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
23657                         trigger_allbutself_cpu_backtrace();
23659 +               raw_spin_unlock(&watchdog_output_lock);
23660                 if (hardlockup_panic)
23661                         nmi_panic(regs, "Hard LOCKUP");
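
The watchdog_overflow_callback() hunk above does two things for RT: it calls printk_kill() first so that, as the added comment explains, the NMI-context report cannot lock up in printk() when early-printk is in use, and it serializes the multi-line lockup report with a raw spinlock so dumps from several CPUs do not interleave. The second idea is generic; the plain user-space analogue below uses POSIX threads only (nothing in it is kernel API) and merely shows why one lock around the whole dump keeps concurrent reports readable.

/* build: cc -pthread serialize_dump.c */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t output_lock;

static void dump_report(int id)
{
        /* the lock keeps the three lines of one report together */
        pthread_spin_lock(&output_lock);
        printf("report from worker %d: line 1\n", id);
        printf("report from worker %d: line 2\n", id);
        printf("report from worker %d: line 3\n", id);
        pthread_spin_unlock(&output_lock);
}

static void *worker(void *arg)
{
        dump_report((int)(long)arg);
        return NULL;
}

int main(void)
{
        pthread_t t[4];
        long i;

        pthread_spin_init(&output_lock, PTHREAD_PROCESS_PRIVATE);
        for (i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        pthread_spin_destroy(&output_lock);
        return 0;
}
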
23663 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
23664 index 181c2ad0cb54..7eed129f114a 100644
23665 --- a/kernel/workqueue.c
23666 +++ b/kernel/workqueue.c
23667 @@ -48,6 +48,8 @@
23668  #include <linux/nodemask.h>
23669  #include <linux/moduleparam.h>
23670  #include <linux/uaccess.h>
23671 +#include <linux/locallock.h>
23672 +#include <linux/delay.h>
23674  #include "workqueue_internal.h"
23676 @@ -122,11 +124,16 @@ enum {
23677   *    cpu or grabbing pool->lock is enough for read access.  If
23678   *    POOL_DISASSOCIATED is set, it's identical to L.
23679   *
23680 + *    On RT we need the extra protection via rt_lock_idle_list() for
23681 + *    the list manipulations against read access from
23682 + *    wq_worker_sleeping(). All other places are nicely serialized via
23683 + *    pool->lock.
23684 + *
23685   * A: pool->attach_mutex protected.
23686   *
23687   * PL: wq_pool_mutex protected.
23688   *
23689 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
23690 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
23691   *
23692   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
23693   *
23694 @@ -135,7 +142,7 @@ enum {
23695   *
23696   * WQ: wq->mutex protected.
23697   *
23698 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
23699 + * WR: wq->mutex protected for writes.  RCU protected for reads.
23700   *
23701   * MD: wq_mayday_lock protected.
23702   */
23703 @@ -185,7 +192,7 @@ struct worker_pool {
23704         atomic_t                nr_running ____cacheline_aligned_in_smp;
23706         /*
23707 -        * Destruction of pool is sched-RCU protected to allow dereferences
23708 +        * Destruction of pool is RCU protected to allow dereferences
23709          * from get_work_pool().
23710          */
23711         struct rcu_head         rcu;
23712 @@ -214,7 +221,7 @@ struct pool_workqueue {
23713         /*
23714          * Release of unbound pwq is punted to system_wq.  See put_pwq()
23715          * and pwq_unbound_release_workfn() for details.  pool_workqueue
23716 -        * itself is also sched-RCU protected so that the first pwq can be
23717 +        * itself is also RCU protected so that the first pwq can be
23718          * determined without grabbing wq->mutex.
23719          */
23720         struct work_struct      unbound_release_work;
23721 @@ -349,6 +356,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
23722  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
23723  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
23725 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
23727  static int worker_thread(void *__worker);
23728  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23730 @@ -356,20 +365,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23731  #include <trace/events/workqueue.h>
23733  #define assert_rcu_or_pool_mutex()                                     \
23734 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23735 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23736                          !lockdep_is_held(&wq_pool_mutex),              \
23737 -                        "sched RCU or wq_pool_mutex should be held")
23738 +                        "RCU or wq_pool_mutex should be held")
23740  #define assert_rcu_or_wq_mutex(wq)                                     \
23741 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23742 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23743                          !lockdep_is_held(&wq->mutex),                  \
23744 -                        "sched RCU or wq->mutex should be held")
23745 +                        "RCU or wq->mutex should be held")
23747  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
23748 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23749 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23750                          !lockdep_is_held(&wq->mutex) &&                \
23751                          !lockdep_is_held(&wq_pool_mutex),              \
23752 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
23753 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
23755  #define for_each_cpu_worker_pool(pool, cpu)                            \
23756         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
23757 @@ -381,7 +390,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23758   * @pool: iteration cursor
23759   * @pi: integer used for iteration
23760   *
23761 - * This must be called either with wq_pool_mutex held or sched RCU read
23762 + * This must be called either with wq_pool_mutex held or RCU read
23763   * locked.  If the pool needs to be used beyond the locking in effect, the
23764   * caller is responsible for guaranteeing that the pool stays online.
23765   *
23766 @@ -413,7 +422,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23767   * @pwq: iteration cursor
23768   * @wq: the target workqueue
23769   *
23770 - * This must be called either with wq->mutex held or sched RCU read locked.
23771 + * This must be called either with wq->mutex held or RCU read locked.
23772   * If the pwq needs to be used beyond the locking in effect, the caller is
23773   * responsible for guaranteeing that the pwq stays online.
23774   *
23775 @@ -425,6 +434,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23776                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
23777                 else
23779 +#ifdef CONFIG_PREEMPT_RT_BASE
23780 +static inline void rt_lock_idle_list(struct worker_pool *pool)
23782 +       preempt_disable();
23784 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
23786 +       preempt_enable();
23788 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
23789 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
23790 +#else
23791 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
23792 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
23793 +static inline void sched_lock_idle_list(struct worker_pool *pool)
23795 +       spin_lock_irq(&pool->lock);
23797 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
23799 +       spin_unlock_irq(&pool->lock);
23801 +#endif
23804  #ifdef CONFIG_DEBUG_OBJECTS_WORK
23806  static struct debug_obj_descr work_debug_descr;
23807 @@ -549,7 +583,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
23808   * @wq: the target workqueue
23809   * @node: the node ID
23810   *
23811 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
23812 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
23813   * read locked.
23814   * If the pwq needs to be used beyond the locking in effect, the caller is
23815   * responsible for guaranteeing that the pwq stays online.
23816 @@ -693,8 +727,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
23817   * @work: the work item of interest
23818   *
23819   * Pools are created and destroyed under wq_pool_mutex, and allows read
23820 - * access under sched-RCU read lock.  As such, this function should be
23821 - * called under wq_pool_mutex or with preemption disabled.
23822 + * access under RCU read lock.  As such, this function should be
23823 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
23824   *
23825   * All fields of the returned pool are accessible as long as the above
23826   * mentioned locking is in effect.  If the returned pool needs to be used
23827 @@ -831,50 +865,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
23828   */
23829  static void wake_up_worker(struct worker_pool *pool)
23831 -       struct worker *worker = first_idle_worker(pool);
23832 +       struct worker *worker;
23834 +       rt_lock_idle_list(pool);
23836 +       worker = first_idle_worker(pool);
23838         if (likely(worker))
23839                 wake_up_process(worker->task);
23841 +       rt_unlock_idle_list(pool);
23844  /**
23845 - * wq_worker_waking_up - a worker is waking up
23846 + * wq_worker_running - a worker is running again
23847   * @task: task waking up
23848 - * @cpu: CPU @task is waking up to
23849   *
23850 - * This function is called during try_to_wake_up() when a worker is
23851 - * being awoken.
23852 - *
23853 - * CONTEXT:
23854 - * spin_lock_irq(rq->lock)
23855 + * This function is called when a worker returns from schedule()
23856   */
23857 -void wq_worker_waking_up(struct task_struct *task, int cpu)
23858 +void wq_worker_running(struct task_struct *task)
23860         struct worker *worker = kthread_data(task);
23862 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
23863 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
23864 +       if (!worker->sleeping)
23865 +               return;
23866 +       if (!(worker->flags & WORKER_NOT_RUNNING))
23867                 atomic_inc(&worker->pool->nr_running);
23868 -       }
23869 +       worker->sleeping = 0;
23872  /**
23873   * wq_worker_sleeping - a worker is going to sleep
23874   * @task: task going to sleep
23875   *
23876 - * This function is called during schedule() when a busy worker is
23877 - * going to sleep.  Worker on the same cpu can be woken up by
23878 - * returning pointer to its task.
23879 - *
23880 - * CONTEXT:
23881 - * spin_lock_irq(rq->lock)
23882 - *
23883 - * Return:
23884 - * Worker task on @cpu to wake up, %NULL if none.
23885 + * This function is called from schedule() when a busy worker is
23886 + * going to sleep.
23887   */
23888 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
23889 +void wq_worker_sleeping(struct task_struct *task)
23891 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
23892 +       struct worker *worker = kthread_data(task);
23893         struct worker_pool *pool;
23895         /*
23896 @@ -883,29 +912,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
23897          * checking NOT_RUNNING.
23898          */
23899         if (worker->flags & WORKER_NOT_RUNNING)
23900 -               return NULL;
23901 +               return;
23903         pool = worker->pool;
23905 -       /* this can only happen on the local cpu */
23906 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
23907 -               return NULL;
23908 +       if (WARN_ON_ONCE(worker->sleeping))
23909 +               return;
23911 +       worker->sleeping = 1;
23913         /*
23914          * The counterpart of the following dec_and_test, implied mb,
23915          * worklist not empty test sequence is in insert_work().
23916          * Please read comment there.
23917 -        *
23918 -        * NOT_RUNNING is clear.  This means that we're bound to and
23919 -        * running on the local cpu w/ rq lock held and preemption
23920 -        * disabled, which in turn means that none else could be
23921 -        * manipulating idle_list, so dereferencing idle_list without pool
23922 -        * lock is safe.
23923          */
23924         if (atomic_dec_and_test(&pool->nr_running) &&
23925 -           !list_empty(&pool->worklist))
23926 -               to_wakeup = first_idle_worker(pool);
23927 -       return to_wakeup ? to_wakeup->task : NULL;
23928 +           !list_empty(&pool->worklist)) {
23929 +               sched_lock_idle_list(pool);
23930 +               wake_up_worker(pool);
23931 +               sched_unlock_idle_list(pool);
23932 +       }
23935  /**
23936 @@ -1099,12 +1125,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
23938         if (pwq) {
23939                 /*
23940 -                * As both pwqs and pools are sched-RCU protected, the
23941 +                * As both pwqs and pools are RCU protected, the
23942                  * following lock operations are safe.
23943                  */
23944 -               spin_lock_irq(&pwq->pool->lock);
23945 +               rcu_read_lock();
23946 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
23947                 put_pwq(pwq);
23948 -               spin_unlock_irq(&pwq->pool->lock);
23949 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
23950 +               rcu_read_unlock();
23951         }
23954 @@ -1208,7 +1236,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23955         struct worker_pool *pool;
23956         struct pool_workqueue *pwq;
23958 -       local_irq_save(*flags);
23959 +       local_lock_irqsave(pendingb_lock, *flags);
23961         /* try to steal the timer if it exists */
23962         if (is_dwork) {
23963 @@ -1227,6 +1255,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23964         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
23965                 return 0;
23967 +       rcu_read_lock();
23968         /*
23969          * The queueing is in progress, or it is already queued. Try to
23970          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
23971 @@ -1265,14 +1294,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23972                 set_work_pool_and_keep_pending(work, pool->id);
23974                 spin_unlock(&pool->lock);
23975 +               rcu_read_unlock();
23976                 return 1;
23977         }
23978         spin_unlock(&pool->lock);
23979  fail:
23980 -       local_irq_restore(*flags);
23981 +       rcu_read_unlock();
23982 +       local_unlock_irqrestore(pendingb_lock, *flags);
23983         if (work_is_canceling(work))
23984                 return -ENOENT;
23985 -       cpu_relax();
23986 +       cpu_chill();
23987         return -EAGAIN;
23990 @@ -1374,7 +1405,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23991          * queued or lose PENDING.  Grabbing PENDING and queueing should
23992          * happen with IRQ disabled.
23993          */
23994 -       WARN_ON_ONCE(!irqs_disabled());
23995 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
23997         debug_work_activate(work);
23999 @@ -1382,6 +1413,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24000         if (unlikely(wq->flags & __WQ_DRAINING) &&
24001             WARN_ON_ONCE(!is_chained_work(wq)))
24002                 return;
24003 +       rcu_read_lock();
24004  retry:
24005         if (req_cpu == WORK_CPU_UNBOUND)
24006                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
24007 @@ -1438,10 +1470,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24008         /* pwq determined, queue */
24009         trace_workqueue_queue_work(req_cpu, pwq, work);
24011 -       if (WARN_ON(!list_empty(&work->entry))) {
24012 -               spin_unlock(&pwq->pool->lock);
24013 -               return;
24014 -       }
24015 +       if (WARN_ON(!list_empty(&work->entry)))
24016 +               goto out;
24018         pwq->nr_in_flight[pwq->work_color]++;
24019         work_flags = work_color_to_flags(pwq->work_color);
24020 @@ -1459,7 +1489,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24022         insert_work(pwq, work, worklist, work_flags);
24024 +out:
24025         spin_unlock(&pwq->pool->lock);
24026 +       rcu_read_unlock();
24029  /**
24030 @@ -1479,14 +1511,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
24031         bool ret = false;
24032         unsigned long flags;
24034 -       local_irq_save(flags);
24035 +       local_lock_irqsave(pendingb_lock,flags);
24037         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
24038                 __queue_work(cpu, wq, work);
24039                 ret = true;
24040         }
24042 -       local_irq_restore(flags);
24043 +       local_unlock_irqrestore(pendingb_lock, flags);
24044         return ret;
24046  EXPORT_SYMBOL(queue_work_on);
24047 @@ -1554,14 +1586,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
24048         unsigned long flags;
24050         /* read the comment in __queue_work() */
24051 -       local_irq_save(flags);
24052 +       local_lock_irqsave(pendingb_lock, flags);
24054         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
24055                 __queue_delayed_work(cpu, wq, dwork, delay);
24056                 ret = true;
24057         }
24059 -       local_irq_restore(flags);
24060 +       local_unlock_irqrestore(pendingb_lock, flags);
24061         return ret;
24063  EXPORT_SYMBOL(queue_delayed_work_on);
24064 @@ -1596,7 +1628,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
24066         if (likely(ret >= 0)) {
24067                 __queue_delayed_work(cpu, wq, dwork, delay);
24068 -               local_irq_restore(flags);
24069 +               local_unlock_irqrestore(pendingb_lock, flags);
24070         }
24072         /* -ENOENT from try_to_grab_pending() becomes %true */
24073 @@ -1629,7 +1661,9 @@ static void worker_enter_idle(struct worker *worker)
24074         worker->last_active = jiffies;
24076         /* idle_list is LIFO */
24077 +       rt_lock_idle_list(pool);
24078         list_add(&worker->entry, &pool->idle_list);
24079 +       rt_unlock_idle_list(pool);
24081         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
24082                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
24083 @@ -1662,7 +1696,9 @@ static void worker_leave_idle(struct worker *worker)
24084                 return;
24085         worker_clr_flags(worker, WORKER_IDLE);
24086         pool->nr_idle--;
24087 +       rt_lock_idle_list(pool);
24088         list_del_init(&worker->entry);
24089 +       rt_unlock_idle_list(pool);
24092  static struct worker *alloc_worker(int node)
24093 @@ -1828,7 +1864,9 @@ static void destroy_worker(struct worker *worker)
24094         pool->nr_workers--;
24095         pool->nr_idle--;
24097 +       rt_lock_idle_list(pool);
24098         list_del_init(&worker->entry);
24099 +       rt_unlock_idle_list(pool);
24100         worker->flags |= WORKER_DIE;
24101         wake_up_process(worker->task);
24103 @@ -2780,14 +2818,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
24105         might_sleep();
24107 -       local_irq_disable();
24108 +       rcu_read_lock();
24109         pool = get_work_pool(work);
24110         if (!pool) {
24111 -               local_irq_enable();
24112 +               rcu_read_unlock();
24113                 return false;
24114         }
24116 -       spin_lock(&pool->lock);
24117 +       spin_lock_irq(&pool->lock);
24118         /* see the comment in try_to_grab_pending() with the same code */
24119         pwq = get_work_pwq(work);
24120         if (pwq) {
24121 @@ -2816,10 +2854,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
24122         else
24123                 lock_map_acquire_read(&pwq->wq->lockdep_map);
24124         lock_map_release(&pwq->wq->lockdep_map);
24126 +       rcu_read_unlock();
24127         return true;
24128  already_gone:
24129         spin_unlock_irq(&pool->lock);
24130 +       rcu_read_unlock();
24131         return false;
24134 @@ -2906,7 +2945,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
24136         /* tell other tasks trying to grab @work to back off */
24137         mark_work_canceling(work);
24138 -       local_irq_restore(flags);
24139 +       local_unlock_irqrestore(pendingb_lock, flags);
24141         flush_work(work);
24142         clear_work_data(work);
24143 @@ -2961,10 +3000,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
24144   */
24145  bool flush_delayed_work(struct delayed_work *dwork)
24147 -       local_irq_disable();
24148 +       local_lock_irq(pendingb_lock);
24149         if (del_timer_sync(&dwork->timer))
24150                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
24151 -       local_irq_enable();
24152 +       local_unlock_irq(pendingb_lock);
24153         return flush_work(&dwork->work);
24155  EXPORT_SYMBOL(flush_delayed_work);
24156 @@ -2982,7 +3021,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
24157                 return false;
24159         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
24160 -       local_irq_restore(flags);
24161 +       local_unlock_irqrestore(pendingb_lock, flags);
24162         return ret;
24165 @@ -3239,7 +3278,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
24166   * put_unbound_pool - put a worker_pool
24167   * @pool: worker_pool to put
24168   *
24169 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
24170 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
24171   * safe manner.  get_unbound_pool() calls this function on its failure path
24172   * and this function should be able to release pools which went through,
24173   * successfully or not, init_worker_pool().
24174 @@ -3293,8 +3332,8 @@ static void put_unbound_pool(struct worker_pool *pool)
24175         del_timer_sync(&pool->idle_timer);
24176         del_timer_sync(&pool->mayday_timer);
24178 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
24179 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
24180 +       /* RCU protected to allow dereferences from get_work_pool() */
24181 +       call_rcu(&pool->rcu, rcu_free_pool);
24184  /**
24185 @@ -3401,14 +3440,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
24186         put_unbound_pool(pool);
24187         mutex_unlock(&wq_pool_mutex);
24189 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
24190 +       call_rcu(&pwq->rcu, rcu_free_pwq);
24192         /*
24193          * If we're the last pwq going away, @wq is already dead and no one
24194          * is gonna access it anymore.  Schedule RCU free.
24195          */
24196         if (is_last)
24197 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
24198 +               call_rcu(&wq->rcu, rcu_free_wq);
24201  /**
24202 @@ -4072,7 +4111,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
24203                  * The base ref is never dropped on per-cpu pwqs.  Directly
24204                  * schedule RCU free.
24205                  */
24206 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
24207 +               call_rcu(&wq->rcu, rcu_free_wq);
24208         } else {
24209                 /*
24210                  * We're the sole accessor of @wq at this point.  Directly
24211 @@ -4166,7 +4205,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
24212         struct pool_workqueue *pwq;
24213         bool ret;
24215 -       rcu_read_lock_sched();
24216 +       rcu_read_lock();
24217 +       preempt_disable();
24219         if (cpu == WORK_CPU_UNBOUND)
24220                 cpu = smp_processor_id();
24221 @@ -4177,7 +4217,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
24222                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
24224         ret = !list_empty(&pwq->delayed_works);
24225 -       rcu_read_unlock_sched();
24226 +       preempt_enable();
24227 +       rcu_read_unlock();
24229         return ret;
24231 @@ -4203,15 +4244,15 @@ unsigned int work_busy(struct work_struct *work)
24232         if (work_pending(work))
24233                 ret |= WORK_BUSY_PENDING;
24235 -       local_irq_save(flags);
24236 +       rcu_read_lock();
24237         pool = get_work_pool(work);
24238         if (pool) {
24239 -               spin_lock(&pool->lock);
24240 +               spin_lock_irqsave(&pool->lock, flags);
24241                 if (find_worker_executing_work(pool, work))
24242                         ret |= WORK_BUSY_RUNNING;
24243 -               spin_unlock(&pool->lock);
24244 +               spin_unlock_irqrestore(&pool->lock, flags);
24245         }
24246 -       local_irq_restore(flags);
24247 +       rcu_read_unlock();
24249         return ret;
24251 @@ -4400,7 +4441,7 @@ void show_workqueue_state(void)
24252         unsigned long flags;
24253         int pi;
24255 -       rcu_read_lock_sched();
24256 +       rcu_read_lock();
24258         pr_info("Showing busy workqueues and worker pools:\n");
24260 @@ -4453,7 +4494,7 @@ void show_workqueue_state(void)
24261                 spin_unlock_irqrestore(&pool->lock, flags);
24262         }
24264 -       rcu_read_unlock_sched();
24265 +       rcu_read_unlock();
24268  /*
24269 @@ -4791,16 +4832,16 @@ bool freeze_workqueues_busy(void)
24270                  * nr_active is monotonically decreasing.  It's safe
24271                  * to peek without lock.
24272                  */
24273 -               rcu_read_lock_sched();
24274 +               rcu_read_lock();
24275                 for_each_pwq(pwq, wq) {
24276                         WARN_ON_ONCE(pwq->nr_active < 0);
24277                         if (pwq->nr_active) {
24278                                 busy = true;
24279 -                               rcu_read_unlock_sched();
24280 +                               rcu_read_unlock();
24281                                 goto out_unlock;
24282                         }
24283                 }
24284 -               rcu_read_unlock_sched();
24285 +               rcu_read_unlock();
24286         }
24287  out_unlock:
24288         mutex_unlock(&wq_pool_mutex);
24289 @@ -4990,7 +5031,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
24290         const char *delim = "";
24291         int node, written = 0;
24293 -       rcu_read_lock_sched();
24294 +       get_online_cpus();
24295 +       rcu_read_lock();
24296         for_each_node(node) {
24297                 written += scnprintf(buf + written, PAGE_SIZE - written,
24298                                      "%s%d:%d", delim, node,
24299 @@ -4998,7 +5040,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
24300                 delim = " ";
24301         }
24302         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
24303 -       rcu_read_unlock_sched();
24304 +       rcu_read_unlock();
24305 +       put_online_cpus();
24307         return written;
24309 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
24310 index 29fa81f0f51a..42d1e3974554 100644
24311 --- a/kernel/workqueue_internal.h
24312 +++ b/kernel/workqueue_internal.h
24313 @@ -44,6 +44,7 @@ struct worker {
24314         unsigned long           last_active;    /* L: last active timestamp */
24315         unsigned int            flags;          /* X: flags */
24316         int                     id;             /* I: worker id */
24317 +       int                     sleeping;       /* None */
24319         /*
24320          * Opaque string set with work_set_desc().  Printed out with task
24321 @@ -69,7 +70,7 @@ static inline struct worker *current_wq_worker(void)
24322   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
24323   * sched/core.c and workqueue.c.
24324   */
24325 -void wq_worker_waking_up(struct task_struct *task, int cpu);
24326 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
24327 +void wq_worker_running(struct task_struct *task);
24328 +void wq_worker_sleeping(struct task_struct *task);
24330  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
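
With the two workqueue files above, the scheduler no longer gets a task handed back to wake: wq_worker_sleeping(), which no longer runs with the runqueue lock held (the CONTEXT notes are dropped above), only marks the worker as sleeping and, if it was the pool's last running worker and work is pending, wakes an idle worker itself under the idle-list protection introduced earlier in the workqueue.c hunk; wq_worker_running() undoes the accounting when the worker returns from schedule(). The reduced model below restates that state machine in plain C so the flag/counter interplay is easy to follow; the types and helpers are stand-ins, not kernel code.

/*
 * Reduced model of the wq_worker_sleeping()/wq_worker_running() pair
 * introduced above.  Everything here is a stand-in for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

#define WORKER_NOT_RUNNING      0x1     /* stand-in for the kernel flag group */

struct pool {
        int nr_running;                 /* workers currently on a CPU */
        bool worklist_empty;            /* is there queued work? */
};

struct worker {
        unsigned int flags;
        int sleeping;
        struct pool *pool;
};

static void wake_up_worker(struct pool *pool)
{
        printf("waking an idle worker (nr_running hit 0 with work pending)\n");
}

static void model_wq_worker_sleeping(struct worker *w)
{
        if (w->flags & WORKER_NOT_RUNNING)
                return;
        if (w->sleeping)                /* WARN_ON_ONCE() in the real code */
                return;
        w->sleeping = 1;
        if (--w->pool->nr_running == 0 && !w->pool->worklist_empty)
                wake_up_worker(w->pool);
}

static void model_wq_worker_running(struct worker *w)
{
        if (!w->sleeping)
                return;
        if (!(w->flags & WORKER_NOT_RUNNING))
                w->pool->nr_running++;
        w->sleeping = 0;
}

int main(void)
{
        struct pool pool = { .nr_running = 1, .worklist_empty = false };
        struct worker w = { .flags = 0, .sleeping = 0, .pool = &pool };

        model_wq_worker_sleeping(&w);   /* prints the wake-up message */
        model_wq_worker_running(&w);    /* restores nr_running to 1 */
        printf("nr_running = %d, sleeping = %d\n", pool.nr_running, w.sleeping);
        return 0;
}
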
24331 diff --git a/lib/Kconfig b/lib/Kconfig
24332 index 260a80e313b9..b06becb3f477 100644
24333 --- a/lib/Kconfig
24334 +++ b/lib/Kconfig
24335 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
24337  config CPUMASK_OFFSTACK
24338         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
24339 +       depends on !PREEMPT_RT_FULL
24340         help
24341           Use dynamic allocation for cpumask_var_t, instead of putting
24342           them on the stack.  This is a bit more expensive, but avoids
24343 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
24344 index 056052dc8e91..d8494e126de8 100644
24345 --- a/lib/debugobjects.c
24346 +++ b/lib/debugobjects.c
24347 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
24348         struct debug_obj *obj;
24349         unsigned long flags;
24351 -       fill_pool();
24352 +#ifdef CONFIG_PREEMPT_RT_FULL
24353 +       if (preempt_count() == 0 && !irqs_disabled())
24354 +#endif
24355 +               fill_pool();
24357         db = get_bucket((unsigned long) addr);
24359 diff --git a/lib/idr.c b/lib/idr.c
24360 index 6098336df267..9decbe914595 100644
24361 --- a/lib/idr.c
24362 +++ b/lib/idr.c
24363 @@ -30,6 +30,7 @@
24364  #include <linux/idr.h>
24365  #include <linux/spinlock.h>
24366  #include <linux/percpu.h>
24367 +#include <linux/locallock.h>
24369  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
24370  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
24371 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
24372  static DEFINE_PER_CPU(int, idr_preload_cnt);
24373  static DEFINE_SPINLOCK(simple_ida_lock);
24375 +#ifdef CONFIG_PREEMPT_RT_FULL
24376 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
24378 +static inline void idr_preload_lock(void)
24380 +       local_lock(idr_lock);
24383 +static inline void idr_preload_unlock(void)
24385 +       local_unlock(idr_lock);
24388 +void idr_preload_end(void)
24390 +       idr_preload_unlock();
24392 +EXPORT_SYMBOL(idr_preload_end);
24393 +#else
24394 +static inline void idr_preload_lock(void)
24396 +       preempt_disable();
24399 +static inline void idr_preload_unlock(void)
24401 +       preempt_enable();
24403 +#endif
24406  /* the maximum ID which can be allocated given idr->layers */
24407  static int idr_max(int layers)
24409 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
24410          * context.  See idr_preload() for details.
24411          */
24412         if (!in_interrupt()) {
24413 -               preempt_disable();
24414 +               idr_preload_lock();
24415                 new = __this_cpu_read(idr_preload_head);
24416                 if (new) {
24417                         __this_cpu_write(idr_preload_head, new->ary[0]);
24418                         __this_cpu_dec(idr_preload_cnt);
24419                         new->ary[0] = NULL;
24420                 }
24421 -               preempt_enable();
24422 +               idr_preload_unlock();
24423                 if (new)
24424                         return new;
24425         }
24426 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
24427         idr_mark_full(pa, id);
24431  /**
24432   * idr_preload - preload for idr_alloc()
24433   * @gfp_mask: allocation mask to use for preloading
24434 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
24435         WARN_ON_ONCE(in_interrupt());
24436         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
24438 -       preempt_disable();
24439 +       idr_preload_lock();
24441         /*
24442          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
24443 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
24444         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
24445                 struct idr_layer *new;
24447 -               preempt_enable();
24448 +               idr_preload_unlock();
24449                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
24450 -               preempt_disable();
24451 +               idr_preload_lock();
24452                 if (!new)
24453                         break;
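
The idr_preload() rework above is the pattern this patch applies in several places (the workqueue pendingb_lock earlier, percpu_ida later): a region that used to rely on preempt_disable() to stay on one CPU now goes through a small lock/unlock wrapper, which keeps the preempt_disable() behaviour on !RT and takes a named per-CPU local lock on PREEMPT_RT_FULL, and any blocking allocation is still done with the lock dropped and the per-CPU state re-checked once it is retaken. The user-space analogue below shows only that "drop the lock around a blocking allocation, then re-check" shape; the mutex stands in for the per-CPU lock and malloc() for kmem_cache_zalloc().

/*
 * User-space analogue of the idr_preload() refill loop above.
 * build: cc -pthread preload.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_PRELOAD     8

struct preload_cache {
        pthread_mutex_t lock;           /* stands in for the per-CPU lock */
        void *items[MAX_PRELOAD];
        int count;
};

static void cache_preload(struct preload_cache *c)
{
        pthread_mutex_lock(&c->lock);
        while (c->count < MAX_PRELOAD) {
                void *new;

                /* drop the lock for the (potentially blocking) allocation */
                pthread_mutex_unlock(&c->lock);
                new = malloc(64);
                pthread_mutex_lock(&c->lock);
                if (!new)
                        break;
                /* someone else may have refilled the cache meanwhile */
                if (c->count == MAX_PRELOAD) {
                        free(new);
                        break;
                }
                c->items[c->count++] = new;
        }
        pthread_mutex_unlock(&c->lock);
}

int main(void)
{
        struct preload_cache cache = { .lock = PTHREAD_MUTEX_INITIALIZER };

        cache_preload(&cache);
        printf("preloaded %d items\n", cache.count);
        while (cache.count)
                free(cache.items[--cache.count]);
        return 0;
}
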
24455 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
24456 index 1d6565e81030..b23a79761df7 100644
24457 --- a/lib/irq_poll.c
24458 +++ b/lib/irq_poll.c
24459 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
24460         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
24461         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24462         local_irq_restore(flags);
24463 +       preempt_check_resched_rt();
24465  EXPORT_SYMBOL(irq_poll_sched);
24467 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
24468         local_irq_save(flags);
24469         __irq_poll_complete(iop);
24470         local_irq_restore(flags);
24471 +       preempt_check_resched_rt();
24473  EXPORT_SYMBOL(irq_poll_complete);
24475 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
24476                 }
24478                 local_irq_enable();
24479 +               preempt_check_resched_rt();
24481                 /* Even though interrupts have been re-enabled, this
24482                  * access is safe because interrupts can only add new
24483 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
24484                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24486         local_irq_enable();
24487 +       preempt_check_resched_rt();
24490  /**
24491 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
24492                          this_cpu_ptr(&blk_cpu_iopoll));
24493         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24494         local_irq_enable();
24495 +       preempt_check_resched_rt();
24497         return 0;
24499 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
24500 index f3a217ea0388..4611b156ef79 100644
24501 --- a/lib/locking-selftest.c
24502 +++ b/lib/locking-selftest.c
24503 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
24504  #include "locking-selftest-spin-hardirq.h"
24505  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
24507 +#ifndef CONFIG_PREEMPT_RT_FULL
24509  #include "locking-selftest-rlock-hardirq.h"
24510  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
24512 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
24513  #include "locking-selftest-wlock-softirq.h"
24514  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
24516 +#endif
24518  #undef E1
24519  #undef E2
24521 +#ifndef CONFIG_PREEMPT_RT_FULL
24522  /*
24523   * Enabling hardirqs with a softirq-safe lock held:
24524   */
24525 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
24526  #undef E1
24527  #undef E2
24529 +#endif
24531  /*
24532   * Enabling irqs with an irq-safe lock held:
24533   */
24534 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
24535  #include "locking-selftest-spin-hardirq.h"
24536  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
24538 +#ifndef CONFIG_PREEMPT_RT_FULL
24540  #include "locking-selftest-rlock-hardirq.h"
24541  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
24543 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
24544  #include "locking-selftest-wlock-softirq.h"
24545  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
24547 +#endif
24549  #undef E1
24550  #undef E2
24552 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
24553  #include "locking-selftest-spin-hardirq.h"
24554  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
24556 +#ifndef CONFIG_PREEMPT_RT_FULL
24558  #include "locking-selftest-rlock-hardirq.h"
24559  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
24561 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
24562  #include "locking-selftest-wlock-softirq.h"
24563  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
24565 +#endif
24567  #undef E1
24568  #undef E2
24569  #undef E3
24570 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
24571  #include "locking-selftest-spin-hardirq.h"
24572  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
24574 +#ifndef CONFIG_PREEMPT_RT_FULL
24576  #include "locking-selftest-rlock-hardirq.h"
24577  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
24579 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
24580  #include "locking-selftest-wlock-softirq.h"
24581  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
24583 +#endif
24585  #undef E1
24586  #undef E2
24587  #undef E3
24589 +#ifndef CONFIG_PREEMPT_RT_FULL
24591  /*
24592   * read-lock / write-lock irq inversion.
24593   *
24594 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
24595  #undef E2
24596  #undef E3
24598 +#endif
24600 +#ifndef CONFIG_PREEMPT_RT_FULL
24602  /*
24603   * read-lock / write-lock recursion that is actually safe.
24604   */
24605 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
24606  #undef E2
24607  #undef E3
24609 +#endif
24611  /*
24612   * read-lock / write-lock recursion that is unsafe.
24613   */
24614 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
24616         printk("  --------------------------------------------------------------------------\n");
24618 +#ifndef CONFIG_PREEMPT_RT_FULL
24619         /*
24620          * irq-context testcases:
24621          */
24622 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
24624         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
24625  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
24626 +#else
24627 +       /* On -rt, we only do hardirq context test for raw spinlock */
24628 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
24629 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
24631 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
24632 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
24634 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
24635 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
24636 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
24637 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
24638 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
24639 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
24641 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
24642 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
24643 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
24644 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
24645 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
24646 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
24647 +#endif
24649         ww_tests();
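
All of the locking-selftest changes above follow from one fact: with CONFIG_PREEMPT_RT_FULL, spinlock_t and rwlock_t are rtmutex-based sleeping locks, so the irq-context permutations remain meaningful only for raw spinlocks and the rlock/wlock cases are compiled out. An illustrative fragment (hypothetical, not from the patch) of the usage class the dropped tests model:

        #include <linux/interrupt.h>
        #include <linux/spinlock.h>

        static DEFINE_RWLOCK(example_lock);

        static irqreturn_t example_handler(int irq, void *dev_id)
        {
                /* Fine on mainline; on -rt a rwlock_t may sleep, so taking it
                 * in hard interrupt context is exactly the pattern lockdep is
                 * no longer asked to validate here. */
                read_lock(&example_lock);
                read_unlock(&example_lock);
                return IRQ_HANDLED;
        }
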
24651 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
24652 index 6d40944960de..822a2c027e72 100644
24653 --- a/lib/percpu_ida.c
24654 +++ b/lib/percpu_ida.c
24655 @@ -26,6 +26,9 @@
24656  #include <linux/string.h>
24657  #include <linux/spinlock.h>
24658  #include <linux/percpu_ida.h>
24659 +#include <linux/locallock.h>
24661 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
24663  struct percpu_ida_cpu {
24664         /*
24665 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24666         unsigned long flags;
24667         int tag;
24669 -       local_irq_save(flags);
24670 +       local_lock_irqsave(irq_off_lock, flags);
24671         tags = this_cpu_ptr(pool->tag_cpu);
24673         /* Fastpath */
24674         tag = alloc_local_tag(tags);
24675         if (likely(tag >= 0)) {
24676 -               local_irq_restore(flags);
24677 +               local_unlock_irqrestore(irq_off_lock, flags);
24678                 return tag;
24679         }
24681 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24683                 if (!tags->nr_free)
24684                         alloc_global_tags(pool, tags);
24686                 if (!tags->nr_free)
24687                         steal_tags(pool, tags);
24689 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24690                 }
24692                 spin_unlock(&pool->lock);
24693 -               local_irq_restore(flags);
24694 +               local_unlock_irqrestore(irq_off_lock, flags);
24696                 if (tag >= 0 || state == TASK_RUNNING)
24697                         break;
24698 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24700                 schedule();
24702 -               local_irq_save(flags);
24703 +               local_lock_irqsave(irq_off_lock, flags);
24704                 tags = this_cpu_ptr(pool->tag_cpu);
24705         }
24706         if (state != TASK_RUNNING)
24707 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24709         BUG_ON(tag >= pool->nr_tags);
24711 -       local_irq_save(flags);
24712 +       local_lock_irqsave(irq_off_lock, flags);
24713         tags = this_cpu_ptr(pool->tag_cpu);
24715         spin_lock(&tags->lock);
24716 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24717                 spin_unlock(&pool->lock);
24718         }
24720 -       local_irq_restore(flags);
24721 +       local_unlock_irqrestore(irq_off_lock, flags);
24723  EXPORT_SYMBOL_GPL(percpu_ida_free);
24725 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24726         struct percpu_ida_cpu *remote;
24727         unsigned cpu, i, err = 0;
24729 -       local_irq_save(flags);
24730 +       local_lock_irqsave(irq_off_lock, flags);
24731         for_each_possible_cpu(cpu) {
24732                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
24733                 spin_lock(&remote->lock);
24734 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24735         }
24736         spin_unlock(&pool->lock);
24737  out:
24738 -       local_irq_restore(flags);
24739 +       local_unlock_irqrestore(irq_off_lock, flags);
24740         return err;
24742  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
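
The percpu_ida conversion shows the local-lock pattern that recurs through the rest of this series: a bare local_irq_save()/restore() pair that only guards per-CPU data is replaced by a named lock, so PREEMPT_RT can substitute a per-CPU sleeping lock while non-RT builds still just disable interrupts. A minimal sketch, assuming the include/linux/locallock.h API added by this series (all names hypothetical):

        #include <linux/locallock.h>
        #include <linux/percpu.h>

        static DEFINE_LOCAL_IRQ_LOCK(stats_lock);
        static DEFINE_PER_CPU(unsigned long, stats_count);

        static void stats_inc(void)
        {
                unsigned long flags;

                /* !RT: behaves like local_irq_save(flags).
                 *  RT: takes a per-CPU sleeping lock; interrupts stay enabled,
                 *      but this CPU's data is still serialized. */
                local_lock_irqsave(stats_lock, flags);
                __this_cpu_inc(stats_count);
                local_unlock_irqrestore(stats_lock, flags);
        }
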
24743 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
24744 index 8e6d552c40dd..741da5a77fd5 100644
24745 --- a/lib/radix-tree.c
24746 +++ b/lib/radix-tree.c
24747 @@ -36,7 +36,7 @@
24748  #include <linux/bitops.h>
24749  #include <linux/rcupdate.h>
24750  #include <linux/preempt.h>             /* in_interrupt() */
24752 +#include <linux/locallock.h>
24754  /* Number of nodes in fully populated tree of given height */
24755  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
24756 @@ -68,6 +68,7 @@ struct radix_tree_preload {
24757         struct radix_tree_node *nodes;
24758  };
24759  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
24760 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
24762  static inline void *node_to_entry(void *ptr)
24764 @@ -290,13 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
24765                  * succeed in getting a node here (and never reach
24766                  * kmem_cache_alloc)
24767                  */
24768 -               rtp = this_cpu_ptr(&radix_tree_preloads);
24769 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24770                 if (rtp->nr) {
24771                         ret = rtp->nodes;
24772                         rtp->nodes = ret->private_data;
24773                         ret->private_data = NULL;
24774                         rtp->nr--;
24775                 }
24776 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24777                 /*
24778                  * Update the allocation stack trace as this is more useful
24779                  * for debugging.
24780 @@ -357,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
24781          */
24782         gfp_mask &= ~__GFP_ACCOUNT;
24784 -       preempt_disable();
24785 +       local_lock(radix_tree_preloads_lock);
24786         rtp = this_cpu_ptr(&radix_tree_preloads);
24787         while (rtp->nr < nr) {
24788 -               preempt_enable();
24789 +               local_unlock(radix_tree_preloads_lock);
24790                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
24791                 if (node == NULL)
24792                         goto out;
24793 -               preempt_disable();
24794 +               local_lock(radix_tree_preloads_lock);
24795                 rtp = this_cpu_ptr(&radix_tree_preloads);
24796                 if (rtp->nr < nr) {
24797                         node->private_data = rtp->nodes;
24798 @@ -406,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
24799         if (gfpflags_allow_blocking(gfp_mask))
24800                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
24801         /* Preloading doesn't help anything with this gfp mask, skip it */
24802 -       preempt_disable();
24803 +       local_lock(radix_tree_preloads_lock);
24804         return 0;
24806  EXPORT_SYMBOL(radix_tree_maybe_preload);
24807 @@ -422,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24809         /* Preloading doesn't help anything with this gfp mask, skip it */
24810         if (!gfpflags_allow_blocking(gfp_mask)) {
24811 -               preempt_disable();
24812 +               local_lock(radix_tree_preloads_lock);
24813                 return 0;
24814         }
24816 @@ -456,6 +458,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24817         return __radix_tree_preload(gfp_mask, nr_nodes);
24820 +void radix_tree_preload_end(void)
24822 +       local_unlock(radix_tree_preloads_lock);
24824 +EXPORT_SYMBOL(radix_tree_preload_end);
24826  /*
24827   * The maximum index which can be stored in a radix tree
24828   */
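
For radix-tree users nothing changes at the call sites: the preload bracket is now backed by radix_tree_preloads_lock instead of a bare preempt_disable(), which is also why radix_tree_preload_end() has to become a real, exported function here instead of an inline preempt_enable(). A usage sketch (my_tree, my_lock, index and item are hypothetical):

        #include <linux/radix-tree.h>
        #include <linux/spinlock.h>

        static RADIX_TREE(my_tree, GFP_KERNEL);
        static DEFINE_SPINLOCK(my_lock);

        static int example_insert(unsigned long index, void *item)
        {
                int err = radix_tree_preload(GFP_KERNEL);

                if (err)
                        return err;
                spin_lock(&my_lock);
                err = radix_tree_insert(&my_tree, index, item);
                spin_unlock(&my_lock);
                radix_tree_preload_end();  /* now drops radix_tree_preloads_lock */
                return err;
        }
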
24829 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
24830 index 004fc70fc56a..ccc46992a517 100644
24831 --- a/lib/scatterlist.c
24832 +++ b/lib/scatterlist.c
24833 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
24834                         flush_kernel_dcache_page(miter->page);
24836                 if (miter->__flags & SG_MITER_ATOMIC) {
24837 -                       WARN_ON_ONCE(preemptible());
24838 +                       WARN_ON_ONCE(!pagefault_disabled());
24839                         kunmap_atomic(miter->addr);
24840                 } else
24841                         kunmap(miter->page);
24842 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24843         if (!sg_miter_skip(&miter, skip))
24844                 return false;
24846 -       local_irq_save(flags);
24847 +       local_irq_save_nort(flags);
24849         while (sg_miter_next(&miter) && offset < buflen) {
24850                 unsigned int len;
24851 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24853         sg_miter_stop(&miter);
24855 -       local_irq_restore(flags);
24856 +       local_irq_restore_nort(flags);
24857         return offset;
24859  EXPORT_SYMBOL(sg_copy_buffer);
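
sg_copy_buffer() moves to the _nort variants (which only disable interrupts on non-RT kernels), and sg_miter_stop() now only checks that page faults are disabled: on PREEMPT_RT the SG_MITER_ATOMIC path relies on pagefault_disable() rather than on hard interrupts being off. The helpers behave approximately as follows (a sketch, not the series' literal definitions):

        #ifndef CONFIG_PREEMPT_RT_BASE
        # define local_irq_save_nort(flags)     local_irq_save(flags)
        # define local_irq_restore_nort(flags)  local_irq_restore(flags)
        #else
        # define local_irq_save_nort(flags)     local_save_flags(flags)
        # define local_irq_restore_nort(flags)  ((void)(flags))
        #endif
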
24860 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
24861 index 1afec32de6f2..11fa431046a8 100644
24862 --- a/lib/smp_processor_id.c
24863 +++ b/lib/smp_processor_id.c
24864 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
24865         if (!printk_ratelimit())
24866                 goto out_enable;
24868 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
24869 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
24870 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
24871 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
24872 +               current->comm, current->pid);
24874         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
24875         dump_stack();
24876 diff --git a/mm/Kconfig b/mm/Kconfig
24877 index 86e3e0e74d20..77e5862a1ed2 100644
24878 --- a/mm/Kconfig
24879 +++ b/mm/Kconfig
24880 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
24882  config TRANSPARENT_HUGEPAGE
24883         bool "Transparent Hugepage Support"
24884 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
24885 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
24886         select COMPACTION
24887         select RADIX_TREE_MULTIORDER
24888         help
24889 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
24890 index 6ff2d7744223..b5a91dd53b5f 100644
24891 --- a/mm/backing-dev.c
24892 +++ b/mm/backing-dev.c
24893 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
24895         unsigned long flags;
24897 -       local_irq_save(flags);
24898 +       local_irq_save_nort(flags);
24899         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
24900 -               local_irq_restore(flags);
24901 +               local_irq_restore_nort(flags);
24902                 return;
24903         }
24905 diff --git a/mm/compaction.c b/mm/compaction.c
24906 index 70e6bec46dc2..6678ed58b7c6 100644
24907 --- a/mm/compaction.c
24908 +++ b/mm/compaction.c
24909 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
24910                                 block_start_pfn(cc->migrate_pfn, cc->order);
24912                         if (cc->last_migrated_pfn < current_block_start) {
24913 -                               cpu = get_cpu();
24914 +                               cpu = get_cpu_light();
24915 +                               local_lock_irq(swapvec_lock);
24916                                 lru_add_drain_cpu(cpu);
24917 +                               local_unlock_irq(swapvec_lock);
24918                                 drain_local_pages(zone);
24919 -                               put_cpu();
24920 +                               put_cpu_light();
24921                                 /* No more flushing until we migrate again */
24922                                 cc->last_migrated_pfn = 0;
24923                         }
24924 diff --git a/mm/filemap.c b/mm/filemap.c
24925 index edfb90e3830c..a8d2c7a73d54 100644
24926 --- a/mm/filemap.c
24927 +++ b/mm/filemap.c
24928 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
24929                  * node->private_list is protected by
24930                  * mapping->tree_lock.
24931                  */
24932 -               if (!list_empty(&node->private_list))
24933 -                       list_lru_del(&workingset_shadow_nodes,
24934 +               if (!list_empty(&node->private_list)) {
24935 +                       local_lock(workingset_shadow_lock);
24936 +                       list_lru_del(&__workingset_shadow_nodes,
24937                                      &node->private_list);
24938 +                       local_unlock(workingset_shadow_lock);
24939 +               }
24940         }
24941         return 0;
24943 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
24944                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
24945                                 list_empty(&node->private_list)) {
24946                         node->private_data = mapping;
24947 -                       list_lru_add(&workingset_shadow_nodes,
24948 -                                       &node->private_list);
24949 +                       local_lock(workingset_shadow_lock);
24950 +                       list_lru_add(&__workingset_shadow_nodes,
24951 +                                    &node->private_list);
24952 +                       local_unlock(workingset_shadow_lock);
24953                 }
24954         }
24956 diff --git a/mm/highmem.c b/mm/highmem.c
24957 index 50b4ca6787f0..77518a3b35a1 100644
24958 --- a/mm/highmem.c
24959 +++ b/mm/highmem.c
24960 @@ -29,10 +29,11 @@
24961  #include <linux/kgdb.h>
24962  #include <asm/tlbflush.h>
24965 +#ifndef CONFIG_PREEMPT_RT_FULL
24966  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
24967  DEFINE_PER_CPU(int, __kmap_atomic_idx);
24968  #endif
24969 +#endif
24971  /*
24972   * Virtual_count is not a pure "count".
24973 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
24974  unsigned long totalhigh_pages __read_mostly;
24975  EXPORT_SYMBOL(totalhigh_pages);
24978 +#ifndef CONFIG_PREEMPT_RT_FULL
24979  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
24980 +#endif
24982  unsigned int nr_free_highpages (void)
24984 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
24985 index 2a800c4a39bd..c04403033aec 100644
24986 --- a/mm/memcontrol.c
24987 +++ b/mm/memcontrol.c
24988 @@ -67,6 +67,7 @@
24989  #include <net/sock.h>
24990  #include <net/ip.h>
24991  #include "slab.h"
24992 +#include <linux/locallock.h>
24994  #include <asm/uaccess.h>
24996 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
24997  #define do_swap_account                0
24998  #endif
25000 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
25002  /* Whether legacy memory+swap accounting is active */
25003  static bool do_memsw_account(void)
25005 @@ -1795,7 +1798,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
25006                 return;
25007         /* Notify other cpus that system-wide "drain" is running */
25008         get_online_cpus();
25009 -       curcpu = get_cpu();
25010 +       curcpu = get_cpu_light();
25011         for_each_online_cpu(cpu) {
25012                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
25013                 struct mem_cgroup *memcg;
25014 @@ -1812,7 +1815,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
25015                                 schedule_work_on(cpu, &stock->work);
25016                 }
25017         }
25018 -       put_cpu();
25019 +       put_cpu_light();
25020         put_online_cpus();
25021         mutex_unlock(&percpu_charge_mutex);
25023 @@ -4558,12 +4561,12 @@ static int mem_cgroup_move_account(struct page *page,
25025         ret = 0;
25027 -       local_irq_disable();
25028 +       local_lock_irq(event_lock);
25029         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
25030         memcg_check_events(to, page);
25031         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
25032         memcg_check_events(from, page);
25033 -       local_irq_enable();
25034 +       local_unlock_irq(event_lock);
25035  out_unlock:
25036         unlock_page(page);
25037  out:
25038 @@ -5438,10 +5441,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
25040         commit_charge(page, memcg, lrucare);
25042 -       local_irq_disable();
25043 +       local_lock_irq(event_lock);
25044         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
25045         memcg_check_events(memcg, page);
25046 -       local_irq_enable();
25047 +       local_unlock_irq(event_lock);
25049         if (do_memsw_account() && PageSwapCache(page)) {
25050                 swp_entry_t entry = { .val = page_private(page) };
25051 @@ -5497,14 +5500,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
25052                 memcg_oom_recover(memcg);
25053         }
25055 -       local_irq_save(flags);
25056 +       local_lock_irqsave(event_lock, flags);
25057         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
25058         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
25059         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
25060         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
25061         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
25062         memcg_check_events(memcg, dummy_page);
25063 -       local_irq_restore(flags);
25064 +       local_unlock_irqrestore(event_lock, flags);
25066         if (!mem_cgroup_is_root(memcg))
25067                 css_put_many(&memcg->css, nr_pages);
25068 @@ -5659,10 +5662,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
25070         commit_charge(newpage, memcg, false);
25072 -       local_irq_save(flags);
25073 +       local_lock_irqsave(event_lock, flags);
25074         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
25075         memcg_check_events(memcg, newpage);
25076 -       local_irq_restore(flags);
25077 +       local_unlock_irqrestore(event_lock, flags);
25080  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
25081 @@ -5853,6 +5856,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
25083         struct mem_cgroup *memcg, *swap_memcg;
25084         unsigned short oldid;
25085 +       unsigned long flags;
25087         VM_BUG_ON_PAGE(PageLRU(page), page);
25088         VM_BUG_ON_PAGE(page_count(page), page);
25089 @@ -5893,12 +5897,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
25090          * important here to have the interrupts disabled because it is the
 25091          * only synchronisation we have for updating the per-CPU variables.
25092          */
25093 +       local_lock_irqsave(event_lock, flags);
25094 +#ifndef CONFIG_PREEMPT_RT_BASE
25095         VM_BUG_ON(!irqs_disabled());
25096 +#endif
25097         mem_cgroup_charge_statistics(memcg, page, false, -1);
25098         memcg_check_events(memcg, page);
25100         if (!mem_cgroup_is_root(memcg))
25101                 css_put(&memcg->css);
25102 +       local_unlock_irqrestore(event_lock, flags);
25105  /*
25106 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
25107 index 6f4d27c5bb32..5cd25c745a8f 100644
25108 --- a/mm/mmu_context.c
25109 +++ b/mm/mmu_context.c
25110 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
25111         struct task_struct *tsk = current;
25113         task_lock(tsk);
25114 +       preempt_disable_rt();
25115         active_mm = tsk->active_mm;
25116         if (active_mm != mm) {
25117                 atomic_inc(&mm->mm_count);
25118 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
25119         }
25120         tsk->mm = mm;
25121         switch_mm(active_mm, mm, tsk);
25122 +       preempt_enable_rt();
25123         task_unlock(tsk);
25124  #ifdef finish_arch_post_lock_switch
25125         finish_arch_post_lock_switch();
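
use_mm() needs this guard because task_lock() is a sleeping lock on -rt and therefore no longer provides a preemption-free region around the active_mm/switch_mm() update. The helpers map approximately as follows (sketch; the real definitions are added to the preempt headers by this series):

        #ifdef CONFIG_PREEMPT_RT_BASE
        # define preempt_disable_rt()   preempt_disable()
        # define preempt_enable_rt()    preempt_enable()
        #else
        # define preempt_disable_rt()   do { } while (0)
        # define preempt_enable_rt()    do { } while (0)
        #endif
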
25126 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
25127 index fbc38888252b..1cb08e1406ea 100644
25128 --- a/mm/page_alloc.c
25129 +++ b/mm/page_alloc.c
25130 @@ -61,6 +61,7 @@
25131  #include <linux/page_ext.h>
25132  #include <linux/hugetlb.h>
25133  #include <linux/sched/rt.h>
25134 +#include <linux/locallock.h>
25135  #include <linux/page_owner.h>
25136  #include <linux/kthread.h>
25137  #include <linux/memcontrol.h>
25138 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
25139  EXPORT_SYMBOL(nr_online_nodes);
25140  #endif
25142 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
25144 +#ifdef CONFIG_PREEMPT_RT_BASE
25145 +# define cpu_lock_irqsave(cpu, flags)          \
25146 +       local_lock_irqsave_on(pa_lock, flags, cpu)
25147 +# define cpu_unlock_irqrestore(cpu, flags)     \
25148 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
25149 +#else
25150 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
25151 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
25152 +#endif
25154  int page_group_by_mobility_disabled __read_mostly;
25156  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
25157 @@ -1092,7 +1105,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
25158  #endif /* CONFIG_DEBUG_VM */
25160  /*
25161 - * Frees a number of pages from the PCP lists
25162 + * Frees a number of pages which have been collected from the pcp lists.
25163   * Assumes all pages on list are in same zone, and of same order.
25164   * count is the number of pages to free.
25165   *
25166 @@ -1103,19 +1116,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
25167   * pinned" detection logic.
25168   */
25169  static void free_pcppages_bulk(struct zone *zone, int count,
25170 -                                       struct per_cpu_pages *pcp)
25171 +                              struct list_head *list)
25173 -       int migratetype = 0;
25174 -       int batch_free = 0;
25175         unsigned long nr_scanned;
25176         bool isolated_pageblocks;
25177 +       unsigned long flags;
25179 +       spin_lock_irqsave(&zone->lock, flags);
25181 -       spin_lock(&zone->lock);
25182         isolated_pageblocks = has_isolate_pageblock(zone);
25183         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
25184         if (nr_scanned)
25185                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
25187 +       while (!list_empty(list)) {
25188 +               struct page *page;
25189 +               int mt; /* migratetype of the to-be-freed page */
25191 +               page = list_first_entry(list, struct page, lru);
25192 +               /* must delete as __free_one_page list manipulates */
25193 +               list_del(&page->lru);
25195 +               mt = get_pcppage_migratetype(page);
25196 +               /* MIGRATE_ISOLATE page should not go to pcplists */
25197 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
25198 +               /* Pageblock could have been isolated meanwhile */
25199 +               if (unlikely(isolated_pageblocks))
25200 +                       mt = get_pageblock_migratetype(page);
25202 +               if (bulkfree_pcp_prepare(page))
25203 +                       continue;
25205 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
25206 +               trace_mm_page_pcpu_drain(page, 0, mt);
25207 +               count--;
25208 +       }
25209 +       WARN_ON(count != 0);
25210 +       spin_unlock_irqrestore(&zone->lock, flags);
 25214 + * Moves a number of pages from the PCP lists onto a separate list;
 25215 + * the pages are then freed outside of the locked region.
25216 + *
25217 + * Assumes all pages on list are in same zone, and of same order.
25218 + * count is the number of pages to free.
25219 + */
25220 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
25221 +                             struct list_head *dst)
25223 +       int migratetype = 0;
25224 +       int batch_free = 0;
25226         while (count) {
25227                 struct page *page;
25228                 struct list_head *list;
25229 @@ -1131,7 +1183,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
25230                         batch_free++;
25231                         if (++migratetype == MIGRATE_PCPTYPES)
25232                                 migratetype = 0;
25233 -                       list = &pcp->lists[migratetype];
25234 +                       list = &src->lists[migratetype];
25235                 } while (list_empty(list));
25237                 /* This is the only non-empty list. Free them all. */
25238 @@ -1139,27 +1191,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
25239                         batch_free = count;
25241                 do {
25242 -                       int mt; /* migratetype of the to-be-freed page */
25244                         page = list_last_entry(list, struct page, lru);
25245 -                       /* must delete as __free_one_page list manipulates */
25246                         list_del(&page->lru);
25248 -                       mt = get_pcppage_migratetype(page);
25249 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
25250 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
25251 -                       /* Pageblock could have been isolated meanwhile */
25252 -                       if (unlikely(isolated_pageblocks))
25253 -                               mt = get_pageblock_migratetype(page);
25255 -                       if (bulkfree_pcp_prepare(page))
25256 -                               continue;
25258 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
25259 -                       trace_mm_page_pcpu_drain(page, 0, mt);
25260 +                       list_add(&page->lru, dst);
25261                 } while (--count && --batch_free && !list_empty(list));
25262         }
25263 -       spin_unlock(&zone->lock);
25266  static void free_one_page(struct zone *zone,
25267 @@ -1168,7 +1205,9 @@ static void free_one_page(struct zone *zone,
25268                                 int migratetype)
25270         unsigned long nr_scanned;
25271 -       spin_lock(&zone->lock);
25272 +       unsigned long flags;
25274 +       spin_lock_irqsave(&zone->lock, flags);
25275         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
25276         if (nr_scanned)
25277                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
25278 @@ -1178,7 +1217,7 @@ static void free_one_page(struct zone *zone,
25279                 migratetype = get_pfnblock_migratetype(page, pfn);
25280         }
25281         __free_one_page(page, pfn, zone, order, migratetype);
25282 -       spin_unlock(&zone->lock);
25283 +       spin_unlock_irqrestore(&zone->lock, flags);
25286  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
25287 @@ -1264,10 +1303,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
25288                 return;
25290         migratetype = get_pfnblock_migratetype(page, pfn);
25291 -       local_irq_save(flags);
25292 +       local_lock_irqsave(pa_lock, flags);
25293         __count_vm_events(PGFREE, 1 << order);
25294         free_one_page(page_zone(page), page, pfn, order, migratetype);
25295 -       local_irq_restore(flags);
25296 +       local_unlock_irqrestore(pa_lock, flags);
25299  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
25300 @@ -2282,16 +2321,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
25301  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
25303         unsigned long flags;
25304 +       LIST_HEAD(dst);
25305         int to_drain, batch;
25307 -       local_irq_save(flags);
25308 +       local_lock_irqsave(pa_lock, flags);
25309         batch = READ_ONCE(pcp->batch);
25310         to_drain = min(pcp->count, batch);
25311         if (to_drain > 0) {
25312 -               free_pcppages_bulk(zone, to_drain, pcp);
25313 +               isolate_pcp_pages(to_drain, pcp, &dst);
25314                 pcp->count -= to_drain;
25315         }
25316 -       local_irq_restore(flags);
25317 +       local_unlock_irqrestore(pa_lock, flags);
25318 +       free_pcppages_bulk(zone, to_drain, &dst);
25320  #endif
25322 @@ -2307,16 +2348,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
25323         unsigned long flags;
25324         struct per_cpu_pageset *pset;
25325         struct per_cpu_pages *pcp;
25326 +       LIST_HEAD(dst);
25327 +       int count;
25329 -       local_irq_save(flags);
25330 +       cpu_lock_irqsave(cpu, flags);
25331         pset = per_cpu_ptr(zone->pageset, cpu);
25333         pcp = &pset->pcp;
25334 -       if (pcp->count) {
25335 -               free_pcppages_bulk(zone, pcp->count, pcp);
25336 +       count = pcp->count;
25337 +       if (count) {
25338 +               isolate_pcp_pages(count, pcp, &dst);
25339                 pcp->count = 0;
25340         }
25341 -       local_irq_restore(flags);
25342 +       cpu_unlock_irqrestore(cpu, flags);
25343 +       if (count)
25344 +               free_pcppages_bulk(zone, count, &dst);
25347  /*
25348 @@ -2402,8 +2448,17 @@ void drain_all_pages(struct zone *zone)
25349                 else
25350                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
25351         }
25352 +#ifndef CONFIG_PREEMPT_RT_BASE
25353         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
25354                                                                 zone, 1);
25355 +#else
25356 +       for_each_cpu(cpu, &cpus_with_pcps) {
25357 +               if (zone)
25358 +                       drain_pages_zone(cpu, zone);
25359 +               else
25360 +                       drain_pages(cpu);
25361 +       }
25362 +#endif
25365  #ifdef CONFIG_HIBERNATION
25366 @@ -2463,7 +2518,7 @@ void free_hot_cold_page(struct page *page, bool cold)
25368         migratetype = get_pfnblock_migratetype(page, pfn);
25369         set_pcppage_migratetype(page, migratetype);
25370 -       local_irq_save(flags);
25371 +       local_lock_irqsave(pa_lock, flags);
25372         __count_vm_event(PGFREE);
25374         /*
25375 @@ -2489,12 +2544,17 @@ void free_hot_cold_page(struct page *page, bool cold)
25376         pcp->count++;
25377         if (pcp->count >= pcp->high) {
25378                 unsigned long batch = READ_ONCE(pcp->batch);
25379 -               free_pcppages_bulk(zone, batch, pcp);
25380 +               LIST_HEAD(dst);
25382 +               isolate_pcp_pages(batch, pcp, &dst);
25383                 pcp->count -= batch;
25384 +               local_unlock_irqrestore(pa_lock, flags);
25385 +               free_pcppages_bulk(zone, batch, &dst);
25386 +               return;
25387         }
25389  out:
25390 -       local_irq_restore(flags);
25391 +       local_unlock_irqrestore(pa_lock, flags);
25394  /*
25395 @@ -2629,7 +2689,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25396                 struct per_cpu_pages *pcp;
25397                 struct list_head *list;
25399 -               local_irq_save(flags);
25400 +               local_lock_irqsave(pa_lock, flags);
25401                 do {
25402                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
25403                         list = &pcp->lists[migratetype];
25404 @@ -2656,7 +2716,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25405                  * allocate greater than order-1 page units with __GFP_NOFAIL.
25406                  */
25407                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
25408 -               spin_lock_irqsave(&zone->lock, flags);
25409 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
25411                 do {
25412                         page = NULL;
25413 @@ -2668,22 +2728,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25414                         if (!page)
25415                                 page = __rmqueue(zone, order, migratetype);
25416                 } while (page && check_new_pages(page, order));
25417 -               spin_unlock(&zone->lock);
25418 -               if (!page)
25419 +               if (!page) {
25420 +                       spin_unlock(&zone->lock);
25421                         goto failed;
25422 +               }
25423                 __mod_zone_freepage_state(zone, -(1 << order),
25424                                           get_pcppage_migratetype(page));
25425 +               spin_unlock(&zone->lock);
25426         }
25428         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
25429         zone_statistics(preferred_zone, zone, gfp_flags);
25430 -       local_irq_restore(flags);
25431 +       local_unlock_irqrestore(pa_lock, flags);
25433         VM_BUG_ON_PAGE(bad_range(zone, page), page);
25434         return page;
25436  failed:
25437 -       local_irq_restore(flags);
25438 +       local_unlock_irqrestore(pa_lock, flags);
25439         return NULL;
25442 @@ -6561,7 +6623,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
25443         int cpu = (unsigned long)hcpu;
25445         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
25446 +               local_lock_irq_on(swapvec_lock, cpu);
25447                 lru_add_drain_cpu(cpu);
25448 +               local_unlock_irq_on(swapvec_lock, cpu);
25449                 drain_pages(cpu);
25451                 /*
25452 @@ -6587,6 +6651,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
25453  void __init page_alloc_init(void)
25455         hotcpu_notifier(page_alloc_cpu_notify, 0);
25456 +       local_irq_lock_init(pa_lock);
25459  /*
25460 @@ -7422,7 +7487,7 @@ void zone_pcp_reset(struct zone *zone)
25461         struct per_cpu_pageset *pset;
25463         /* avoid races with drain_pages()  */
25464 -       local_irq_save(flags);
25465 +       local_lock_irqsave(pa_lock, flags);
25466         if (zone->pageset != &boot_pageset) {
25467                 for_each_online_cpu(cpu) {
25468                         pset = per_cpu_ptr(zone->pageset, cpu);
25469 @@ -7431,7 +7496,7 @@ void zone_pcp_reset(struct zone *zone)
25470                 free_percpu(zone->pageset);
25471                 zone->pageset = &boot_pageset;
25472         }
25473 -       local_irq_restore(flags);
25474 +       local_unlock_irqrestore(pa_lock, flags);
25477  #ifdef CONFIG_MEMORY_HOTREMOVE
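
Taken together, the page_alloc hunks turn per-CPU page freeing into a two-phase drain: pages are detached from the pcp lists under pa_lock (or cpu_lock_irqsave() for a remote CPU), and the actual freeing, which needs zone->lock, happens only after the hot per-CPU lock has been dropped. A consolidated sketch of the flow, using the pa_lock, isolate_pcp_pages() and list-based free_pcppages_bulk() introduced above:

        static void example_drain_zone_pcp(struct zone *zone, struct per_cpu_pages *pcp)
        {
                unsigned long flags;
                LIST_HEAD(dst);
                int count;

                local_lock_irqsave(pa_lock, flags);
                count = pcp->count;
                isolate_pcp_pages(count, pcp, &dst);  /* only moves pages onto dst */
                pcp->count = 0;
                local_unlock_irqrestore(pa_lock, flags);

                /* zone->lock is taken (irqsave) inside free_pcppages_bulk() now,
                 * so nothing is freed while the per-CPU lock is held. */
                free_pcppages_bulk(zone, count, &dst);
        }
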
25478 diff --git a/mm/percpu.c b/mm/percpu.c
25479 index f014cebbf405..4e739fcf91bf 100644
25480 --- a/mm/percpu.c
25481 +++ b/mm/percpu.c
25482 @@ -1283,18 +1283,7 @@ void free_percpu(void __percpu *ptr)
25484  EXPORT_SYMBOL_GPL(free_percpu);
25486 -/**
25487 - * is_kernel_percpu_address - test whether address is from static percpu area
25488 - * @addr: address to test
25489 - *
25490 - * Test whether @addr belongs to in-kernel static percpu area.  Module
25491 - * static percpu areas are not considered.  For those, use
25492 - * is_module_percpu_address().
25493 - *
25494 - * RETURNS:
25495 - * %true if @addr is from in-kernel static percpu area, %false otherwise.
25496 - */
25497 -bool is_kernel_percpu_address(unsigned long addr)
25498 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
25500  #ifdef CONFIG_SMP
25501         const size_t static_size = __per_cpu_end - __per_cpu_start;
25502 @@ -1303,15 +1292,38 @@ bool is_kernel_percpu_address(unsigned long addr)
25504         for_each_possible_cpu(cpu) {
25505                 void *start = per_cpu_ptr(base, cpu);
25506 +               void *va = (void *)addr;
25508 -               if ((void *)addr >= start && (void *)addr < start + static_size)
25509 +               if (va >= start && va < start + static_size) {
25510 +                       if (can_addr) {
25511 +                               *can_addr = (unsigned long) (va - start);
25512 +                               *can_addr += (unsigned long)
25513 +                                       per_cpu_ptr(base, get_boot_cpu_id());
25514 +                       }
25515                         return true;
25516 -        }
25517 +               }
25518 +       }
25519  #endif
25520         /* on UP, can't distinguish from other static vars, always false */
25521         return false;
25524 +/**
25525 + * is_kernel_percpu_address - test whether address is from static percpu area
25526 + * @addr: address to test
25527 + *
25528 + * Test whether @addr belongs to in-kernel static percpu area.  Module
25529 + * static percpu areas are not considered.  For those, use
25530 + * is_module_percpu_address().
25531 + *
25532 + * RETURNS:
25533 + * %true if @addr is from in-kernel static percpu area, %false otherwise.
25534 + */
25535 +bool is_kernel_percpu_address(unsigned long addr)
25537 +       return __is_kernel_percpu_address(addr, NULL);
25540  /**
25541   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
25542   * @addr: the address to be converted to physical address
25543 diff --git a/mm/slab.h b/mm/slab.h
25544 index ceb7d70cdb76..dfd281e43fbe 100644
25545 --- a/mm/slab.h
25546 +++ b/mm/slab.h
25547 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
25548   * The slab lists for all objects.
25549   */
25550  struct kmem_cache_node {
25551 +#ifdef CONFIG_SLUB
25552 +       raw_spinlock_t list_lock;
25553 +#else
25554         spinlock_t list_lock;
25555 +#endif
25557  #ifdef CONFIG_SLAB
25558         struct list_head slabs_partial; /* partial list first, better asm code */
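
list_lock becomes a raw_spinlock_t for SLUB because it is taken in paths that already run with interrupts disabled, and on PREEMPT_RT_FULL only raw spinlocks still spin there; an ordinary spinlock_t would be a sleeping rtmutex. A minimal sketch of the distinction (illustrative names only):

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(example_raw_lock);  /* spins on -rt too */
        static DEFINE_SPINLOCK(example_lock);          /* rtmutex-backed on -rt */

        static void example_atomic_section(void)
        {
                unsigned long flags;

                /* Legal in truly atomic context on both mainline and -rt,
                 * provided the critical section stays short and bounded. */
                raw_spin_lock_irqsave(&example_raw_lock, flags);
                raw_spin_unlock_irqrestore(&example_raw_lock, flags);
        }

        static void example_preemptible_section(void)
        {
                spin_lock(&example_lock);   /* may sleep on -rt; not for irq-off paths */
                spin_unlock(&example_lock);
        }
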
25559 diff --git a/mm/slub.c b/mm/slub.c
25560 index edc79ca3c6d5..67eb368b9314 100644
25561 --- a/mm/slub.c
25562 +++ b/mm/slub.c
25563 @@ -1144,7 +1144,7 @@ static noinline int free_debug_processing(
25564         unsigned long uninitialized_var(flags);
25565         int ret = 0;
25567 -       spin_lock_irqsave(&n->list_lock, flags);
25568 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25569         slab_lock(page);
25571         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
25572 @@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
25573                          bulk_cnt, cnt);
25575         slab_unlock(page);
25576 -       spin_unlock_irqrestore(&n->list_lock, flags);
25577 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25578         if (!ret)
25579                 slab_fix(s, "Object at 0x%p not freed", object);
25580         return ret;
25581 @@ -1307,6 +1307,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
25583  #endif /* CONFIG_SLUB_DEBUG */
25585 +struct slub_free_list {
25586 +       raw_spinlock_t          lock;
25587 +       struct list_head        list;
25589 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
25591  /*
25592   * Hooks for other subsystems that check memory allocations. In a typical
25593   * production configuration these hooks all should produce no code at all.
25594 @@ -1530,10 +1536,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25595         void *start, *p;
25596         int idx, order;
25597         bool shuffle;
25598 +       bool enableirqs = false;
25600         flags &= gfp_allowed_mask;
25602         if (gfpflags_allow_blocking(flags))
25603 +               enableirqs = true;
25604 +#ifdef CONFIG_PREEMPT_RT_FULL
25605 +       if (system_state == SYSTEM_RUNNING)
25606 +               enableirqs = true;
25607 +#endif
25608 +       if (enableirqs)
25609                 local_irq_enable();
25611         flags |= s->allocflags;
25612 @@ -1608,7 +1621,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25613         page->frozen = 1;
25615  out:
25616 -       if (gfpflags_allow_blocking(flags))
25617 +       if (enableirqs)
25618                 local_irq_disable();
25619         if (!page)
25620                 return NULL;
25621 @@ -1667,6 +1680,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
25622         __free_pages(page, order);
25625 +static void free_delayed(struct list_head *h)
25627 +       while(!list_empty(h)) {
25628 +               struct page *page = list_first_entry(h, struct page, lru);
25630 +               list_del(&page->lru);
25631 +               __free_slab(page->slab_cache, page);
25632 +       }
25635  #define need_reserve_slab_rcu                                          \
25636         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
25638 @@ -1698,6 +1721,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
25639                 }
25641                 call_rcu(head, rcu_free_slab);
25642 +       } else if (irqs_disabled()) {
25643 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
25645 +               raw_spin_lock(&f->lock);
25646 +               list_add(&page->lru, &f->list);
25647 +               raw_spin_unlock(&f->lock);
25648         } else
25649                 __free_slab(s, page);
25651 @@ -1805,7 +1834,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25652         if (!n || !n->nr_partial)
25653                 return NULL;
25655 -       spin_lock(&n->list_lock);
25656 +       raw_spin_lock(&n->list_lock);
25657         list_for_each_entry_safe(page, page2, &n->partial, lru) {
25658                 void *t;
25660 @@ -1830,7 +1859,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25661                         break;
25663         }
25664 -       spin_unlock(&n->list_lock);
25665 +       raw_spin_unlock(&n->list_lock);
25666         return object;
25669 @@ -2076,7 +2105,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25670                          * that acquire_slab() will see a slab page that
25671                          * is frozen
25672                          */
25673 -                       spin_lock(&n->list_lock);
25674 +                       raw_spin_lock(&n->list_lock);
25675                 }
25676         } else {
25677                 m = M_FULL;
25678 @@ -2087,7 +2116,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25679                          * slabs from diagnostic functions will not see
25680                          * any frozen slabs.
25681                          */
25682 -                       spin_lock(&n->list_lock);
25683 +                       raw_spin_lock(&n->list_lock);
25684                 }
25685         }
25687 @@ -2122,7 +2151,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25688                 goto redo;
25690         if (lock)
25691 -               spin_unlock(&n->list_lock);
25692 +               raw_spin_unlock(&n->list_lock);
25694         if (m == M_FREE) {
25695                 stat(s, DEACTIVATE_EMPTY);
25696 @@ -2154,10 +2183,10 @@ static void unfreeze_partials(struct kmem_cache *s,
25697                 n2 = get_node(s, page_to_nid(page));
25698                 if (n != n2) {
25699                         if (n)
25700 -                               spin_unlock(&n->list_lock);
25701 +                               raw_spin_unlock(&n->list_lock);
25703                         n = n2;
25704 -                       spin_lock(&n->list_lock);
25705 +                       raw_spin_lock(&n->list_lock);
25706                 }
25708                 do {
25709 @@ -2186,7 +2215,7 @@ static void unfreeze_partials(struct kmem_cache *s,
25710         }
25712         if (n)
25713 -               spin_unlock(&n->list_lock);
25714 +               raw_spin_unlock(&n->list_lock);
25716         while (discard_page) {
25717                 page = discard_page;
25718 @@ -2225,14 +2254,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
25719                         pobjects = oldpage->pobjects;
25720                         pages = oldpage->pages;
25721                         if (drain && pobjects > s->cpu_partial) {
25722 +                               struct slub_free_list *f;
25723                                 unsigned long flags;
25724 +                               LIST_HEAD(tofree);
25725                                 /*
25726                                  * partial array is full. Move the existing
25727                                  * set to the per node partial list.
25728                                  */
25729                                 local_irq_save(flags);
25730                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
25731 +                               f = this_cpu_ptr(&slub_free_list);
25732 +                               raw_spin_lock(&f->lock);
25733 +                               list_splice_init(&f->list, &tofree);
25734 +                               raw_spin_unlock(&f->lock);
25735                                 local_irq_restore(flags);
25736 +                               free_delayed(&tofree);
25737                                 oldpage = NULL;
25738                                 pobjects = 0;
25739                                 pages = 0;
25740 @@ -2304,7 +2340,22 @@ static bool has_cpu_slab(int cpu, void *info)
25742  static void flush_all(struct kmem_cache *s)
25744 +       LIST_HEAD(tofree);
25745 +       int cpu;
25747         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
25748 +       for_each_online_cpu(cpu) {
25749 +               struct slub_free_list *f;
25751 +               if (!has_cpu_slab(cpu, s))
25752 +                       continue;
25754 +               f = &per_cpu(slub_free_list, cpu);
25755 +               raw_spin_lock_irq(&f->lock);
25756 +               list_splice_init(&f->list, &tofree);
25757 +               raw_spin_unlock_irq(&f->lock);
25758 +               free_delayed(&tofree);
25759 +       }
25762  /*
25763 @@ -2359,10 +2410,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
25764         unsigned long x = 0;
25765         struct page *page;
25767 -       spin_lock_irqsave(&n->list_lock, flags);
25768 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25769         list_for_each_entry(page, &n->partial, lru)
25770                 x += get_count(page);
25771 -       spin_unlock_irqrestore(&n->list_lock, flags);
25772 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25773         return x;
25775  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
25776 @@ -2500,8 +2551,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
25777   * already disabled (which is the case for bulk allocation).
25778   */
25779  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25780 -                         unsigned long addr, struct kmem_cache_cpu *c)
25781 +                         unsigned long addr, struct kmem_cache_cpu *c,
25782 +                         struct list_head *to_free)
25784 +       struct slub_free_list *f;
25785         void *freelist;
25786         struct page *page;
25788 @@ -2561,6 +2614,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25789         VM_BUG_ON(!c->page->frozen);
25790         c->freelist = get_freepointer(s, freelist);
25791         c->tid = next_tid(c->tid);
25793 +out:
25794 +       f = this_cpu_ptr(&slub_free_list);
25795 +       raw_spin_lock(&f->lock);
25796 +       list_splice_init(&f->list, to_free);
25797 +       raw_spin_unlock(&f->lock);
25799         return freelist;
25801  new_slab:
25802 @@ -2592,7 +2652,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25803         deactivate_slab(s, page, get_freepointer(s, freelist));
25804         c->page = NULL;
25805         c->freelist = NULL;
25806 -       return freelist;
25807 +       goto out;
25810  /*
25811 @@ -2604,6 +2664,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25813         void *p;
25814         unsigned long flags;
25815 +       LIST_HEAD(tofree);
25817         local_irq_save(flags);
25818  #ifdef CONFIG_PREEMPT
25819 @@ -2615,8 +2676,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25820         c = this_cpu_ptr(s->cpu_slab);
25821  #endif
25823 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
25824 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
25825         local_irq_restore(flags);
25826 +       free_delayed(&tofree);
25827         return p;
25830 @@ -2802,7 +2864,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25832         do {
25833                 if (unlikely(n)) {
25834 -                       spin_unlock_irqrestore(&n->list_lock, flags);
25835 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25836                         n = NULL;
25837                 }
25838                 prior = page->freelist;
25839 @@ -2834,7 +2896,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25840                                  * Otherwise the list_lock will synchronize with
25841                                  * other processors updating the list of slabs.
25842                                  */
25843 -                               spin_lock_irqsave(&n->list_lock, flags);
25844 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
25846                         }
25847                 }
25848 @@ -2876,7 +2938,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25849                 add_partial(n, page, DEACTIVATE_TO_TAIL);
25850                 stat(s, FREE_ADD_PARTIAL);
25851         }
25852 -       spin_unlock_irqrestore(&n->list_lock, flags);
25853 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25854         return;
25856  slab_empty:
25857 @@ -2891,7 +2953,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25858                 remove_full(s, n, page);
25859         }
25861 -       spin_unlock_irqrestore(&n->list_lock, flags);
25862 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25863         stat(s, FREE_SLAB);
25864         discard_slab(s, page);
25866 @@ -3096,6 +3158,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25867                           void **p)
25869         struct kmem_cache_cpu *c;
25870 +       LIST_HEAD(to_free);
25871         int i;
25873         /* memcg and kmem_cache debug support */
25874 @@ -3119,7 +3182,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25875                          * of re-populating per CPU c->freelist
25876                          */
25877                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
25878 -                                           _RET_IP_, c);
25879 +                                           _RET_IP_, c, &to_free);
25880                         if (unlikely(!p[i]))
25881                                 goto error;
25883 @@ -3131,6 +3194,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25884         }
25885         c->tid = next_tid(c->tid);
25886         local_irq_enable();
25887 +       free_delayed(&to_free);
25889         /* Clear memory outside IRQ disabled fastpath loop */
25890         if (unlikely(flags & __GFP_ZERO)) {
25891 @@ -3278,7 +3342,7 @@ static void
25892  init_kmem_cache_node(struct kmem_cache_node *n)
25894         n->nr_partial = 0;
25895 -       spin_lock_init(&n->list_lock);
25896 +       raw_spin_lock_init(&n->list_lock);
25897         INIT_LIST_HEAD(&n->partial);
25898  #ifdef CONFIG_SLUB_DEBUG
25899         atomic_long_set(&n->nr_slabs, 0);
25900 @@ -3622,6 +3686,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25901                                                         const char *text)
25903  #ifdef CONFIG_SLUB_DEBUG
25904 +#ifdef CONFIG_PREEMPT_RT_BASE
25905 +       /* XXX move out of irq-off section */
25906 +       slab_err(s, page, text, s->name);
25907 +#else
25908         void *addr = page_address(page);
25909         void *p;
25910         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
25911 @@ -3642,6 +3710,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25912         slab_unlock(page);
25913         kfree(map);
25914  #endif
25915 +#endif
25918  /*
25919 @@ -3655,7 +3724,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25920         struct page *page, *h;
25922         BUG_ON(irqs_disabled());
25923 -       spin_lock_irq(&n->list_lock);
25924 +       raw_spin_lock_irq(&n->list_lock);
25925         list_for_each_entry_safe(page, h, &n->partial, lru) {
25926                 if (!page->inuse) {
25927                         remove_partial(n, page);
25928 @@ -3665,7 +3734,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25929                         "Objects remaining in %s on __kmem_cache_shutdown()");
25930                 }
25931         }
25932 -       spin_unlock_irq(&n->list_lock);
25933 +       raw_spin_unlock_irq(&n->list_lock);
25935         list_for_each_entry_safe(page, h, &discard, lru)
25936                 discard_slab(s, page);
25937 @@ -3908,7 +3977,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25938                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
25939                         INIT_LIST_HEAD(promote + i);
25941 -               spin_lock_irqsave(&n->list_lock, flags);
25942 +               raw_spin_lock_irqsave(&n->list_lock, flags);
25944                 /*
25945                  * Build lists of slabs to discard or promote.
25946 @@ -3939,7 +4008,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25947                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
25948                         list_splice(promote + i, &n->partial);
25950 -               spin_unlock_irqrestore(&n->list_lock, flags);
25951 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
25953                 /* Release empty slabs */
25954                 list_for_each_entry_safe(page, t, &discard, lru)
25955 @@ -4115,6 +4184,12 @@ void __init kmem_cache_init(void)
25957         static __initdata struct kmem_cache boot_kmem_cache,
25958                 boot_kmem_cache_node;
25959 +       int cpu;
25961 +       for_each_possible_cpu(cpu) {
25962 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
25963 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
25964 +       }
25966         if (debug_guardpage_minorder())
25967                 slub_max_order = 0;
25968 @@ -4323,7 +4398,7 @@ static int validate_slab_node(struct kmem_cache *s,
25969         struct page *page;
25970         unsigned long flags;
25972 -       spin_lock_irqsave(&n->list_lock, flags);
25973 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25975         list_for_each_entry(page, &n->partial, lru) {
25976                 validate_slab_slab(s, page, map);
25977 @@ -4345,7 +4420,7 @@ static int validate_slab_node(struct kmem_cache *s,
25978                        s->name, count, atomic_long_read(&n->nr_slabs));
25980  out:
25981 -       spin_unlock_irqrestore(&n->list_lock, flags);
25982 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25983         return count;
25986 @@ -4533,12 +4608,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
25987                 if (!atomic_long_read(&n->nr_slabs))
25988                         continue;
25990 -               spin_lock_irqsave(&n->list_lock, flags);
25991 +               raw_spin_lock_irqsave(&n->list_lock, flags);
25992                 list_for_each_entry(page, &n->partial, lru)
25993                         process_slab(&t, s, page, alloc, map);
25994                 list_for_each_entry(page, &n->full, lru)
25995                         process_slab(&t, s, page, alloc, map);
25996 -               spin_unlock_irqrestore(&n->list_lock, flags);
25997 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
25998         }
26000         for (i = 0; i < t.count; i++) {
26001 diff --git a/mm/swap.c b/mm/swap.c
26002 index 4dcf852e1e6d..69c3a5b24060 100644
26003 --- a/mm/swap.c
26004 +++ b/mm/swap.c
26005 @@ -32,6 +32,7 @@
26006  #include <linux/memcontrol.h>
26007  #include <linux/gfp.h>
26008  #include <linux/uio.h>
26009 +#include <linux/locallock.h>
26010  #include <linux/hugetlb.h>
26011  #include <linux/page_idle.h>
26013 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
26014  #ifdef CONFIG_SMP
26015  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
26016  #endif
26017 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
26018 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
26020  /*
26021   * This path almost never happens for VM activity - pages are normally
26022 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
26023                 unsigned long flags;
26025                 get_page(page);
26026 -               local_irq_save(flags);
26027 +               local_lock_irqsave(rotate_lock, flags);
26028                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
26029                 if (!pagevec_add(pvec, page) || PageCompound(page))
26030                         pagevec_move_tail(pvec);
26031 -               local_irq_restore(flags);
26032 +               local_unlock_irqrestore(rotate_lock, flags);
26033         }
26036 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
26038         page = compound_head(page);
26039         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
26040 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
26041 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26042 +                                                      activate_page_pvecs);
26044                 get_page(page);
26045                 if (!pagevec_add(pvec, page) || PageCompound(page))
26046                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
26047 -               put_cpu_var(activate_page_pvecs);
26048 +               put_locked_var(swapvec_lock, activate_page_pvecs);
26049         }
26052 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
26054  static void __lru_cache_activate_page(struct page *page)
26056 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
26057 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
26058         int i;
26060         /*
26061 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
26062                 }
26063         }
26065 -       put_cpu_var(lru_add_pvec);
26066 +       put_locked_var(swapvec_lock, lru_add_pvec);
26069  /*
26070 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
26072  static void __lru_cache_add(struct page *page)
26074 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
26075 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
26077         get_page(page);
26078         if (!pagevec_add(pvec, page) || PageCompound(page))
26079                 __pagevec_lru_add(pvec);
26080 -       put_cpu_var(lru_add_pvec);
26081 +       put_locked_var(swapvec_lock, lru_add_pvec);
26084  /**
26085 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
26086                 unsigned long flags;
26088                 /* No harm done if a racing interrupt already did this */
26089 -               local_irq_save(flags);
26090 +#ifdef CONFIG_PREEMPT_RT_BASE
26091 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
26092                 pagevec_move_tail(pvec);
26093 -               local_irq_restore(flags);
26094 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
26095 +#else
26096 +               local_lock_irqsave(rotate_lock, flags);
26097 +               pagevec_move_tail(pvec);
26098 +               local_unlock_irqrestore(rotate_lock, flags);
26099 +#endif
26100         }
26102         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
26103 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
26104                 return;
26106         if (likely(get_page_unless_zero(page))) {
26107 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
26108 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26109 +                                                      lru_deactivate_file_pvecs);
26111                 if (!pagevec_add(pvec, page) || PageCompound(page))
26112                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
26113 -               put_cpu_var(lru_deactivate_file_pvecs);
26114 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
26115         }
26118 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
26119  void deactivate_page(struct page *page)
26121         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
26122 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
26123 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26124 +                                                      lru_deactivate_pvecs);
26126                 get_page(page);
26127                 if (!pagevec_add(pvec, page) || PageCompound(page))
26128                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
26129 -               put_cpu_var(lru_deactivate_pvecs);
26130 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
26131         }
26134  void lru_add_drain(void)
26136 -       lru_add_drain_cpu(get_cpu());
26137 -       put_cpu();
26138 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
26139 +       local_unlock_cpu(swapvec_lock);
26142 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
26143 +#ifdef CONFIG_PREEMPT_RT_BASE
26144 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
26146 -       lru_add_drain();
26147 +       local_lock_on(swapvec_lock, cpu);
26148 +       lru_add_drain_cpu(cpu);
26149 +       local_unlock_on(swapvec_lock, cpu);
26152 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
26153 +#else
26155  /*
26156   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
26157 @@ -686,6 +701,22 @@ static int __init lru_init(void)
26159  early_initcall(lru_init);
26161 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
26163 +       lru_add_drain();
26166 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
26167 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
26169 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
26171 +       INIT_WORK(work, lru_add_drain_per_cpu);
26172 +       queue_work_on(cpu, lru_add_drain_wq, work);
26173 +       cpumask_set_cpu(cpu, has_work);
26175 +#endif
26177  void lru_add_drain_all(void)
26179         static DEFINE_MUTEX(lock);
26180 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
26181         cpumask_clear(&has_work);
26183         for_each_online_cpu(cpu) {
26184 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
26186                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
26187                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
26188                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
26189                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
26190 -                   need_activate_page_drain(cpu)) {
26191 -                       INIT_WORK(work, lru_add_drain_per_cpu);
26192 -                       queue_work_on(cpu, lru_add_drain_wq, work);
26193 -                       cpumask_set_cpu(cpu, &has_work);
26194 -               }
26195 +                   need_activate_page_drain(cpu))
26196 +                       remote_lru_add_drain(cpu, &has_work);
26197         }
26199 +#ifndef CONFIG_PREEMPT_RT_BASE
26200         for_each_cpu(cpu, &has_work)
26201                 flush_work(&per_cpu(lru_add_drain_work, cpu));
26202 +#endif
26204         put_online_cpus();
26205         mutex_unlock(&lock);
26206 diff --git a/mm/truncate.c b/mm/truncate.c
26207 index 9c809e7d73c3..b7681e888ba0 100644
26208 --- a/mm/truncate.c
26209 +++ b/mm/truncate.c
26210 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
26211          * protected by mapping->tree_lock.
26212          */
26213         if (!workingset_node_shadows(node) &&
26214 -           !list_empty(&node->private_list))
26215 -               list_lru_del(&workingset_shadow_nodes,
26216 +           !list_empty(&node->private_list)) {
26217 +               local_lock(workingset_shadow_lock);
26218 +               list_lru_del(&__workingset_shadow_nodes,
26219                                 &node->private_list);
26220 +               local_unlock(workingset_shadow_lock);
26221 +       }
26222         __radix_tree_delete_node(&mapping->page_tree, node);
26223  unlock:
26224         spin_unlock_irq(&mapping->tree_lock);
26225 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
26226 index 195de42bea1f..b46cb686fde7 100644
26227 --- a/mm/vmalloc.c
26228 +++ b/mm/vmalloc.c
26229 @@ -855,7 +855,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
26230         struct vmap_block *vb;
26231         struct vmap_area *va;
26232         unsigned long vb_idx;
26233 -       int node, err;
26234 +       int node, err, cpu;
26235         void *vaddr;
26237         node = numa_node_id();
26238 @@ -898,11 +898,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
26239         BUG_ON(err);
26240         radix_tree_preload_end();
26242 -       vbq = &get_cpu_var(vmap_block_queue);
26243 +       cpu = get_cpu_light();
26244 +       vbq = this_cpu_ptr(&vmap_block_queue);
26245         spin_lock(&vbq->lock);
26246         list_add_tail_rcu(&vb->free_list, &vbq->free);
26247         spin_unlock(&vbq->lock);
26248 -       put_cpu_var(vmap_block_queue);
26249 +       put_cpu_light();
26251         return vaddr;
26253 @@ -971,6 +972,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26254         struct vmap_block *vb;
26255         void *vaddr = NULL;
26256         unsigned int order;
26257 +       int cpu;
26259         BUG_ON(offset_in_page(size));
26260         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
26261 @@ -985,7 +987,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26262         order = get_order(size);
26264         rcu_read_lock();
26265 -       vbq = &get_cpu_var(vmap_block_queue);
26266 +       cpu = get_cpu_light();
26267 +       vbq = this_cpu_ptr(&vmap_block_queue);
26268         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
26269                 unsigned long pages_off;
26271 @@ -1008,7 +1011,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26272                 break;
26273         }
26275 -       put_cpu_var(vmap_block_queue);
26276 +       put_cpu_light();
26277         rcu_read_unlock();
26279         /* Allocate new block if nothing was found */
26280 diff --git a/mm/vmstat.c b/mm/vmstat.c
26281 index 6a088df04b29..abda95be88b4 100644
26282 --- a/mm/vmstat.c
26283 +++ b/mm/vmstat.c
26284 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
26285         long x;
26286         long t;
26288 +       preempt_disable_rt();
26289         x = delta + __this_cpu_read(*p);
26291         t = __this_cpu_read(pcp->stat_threshold);
26292 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
26293                 x = 0;
26294         }
26295         __this_cpu_write(*p, x);
26296 +       preempt_enable_rt();
26298  EXPORT_SYMBOL(__mod_zone_page_state);
26300 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
26301         long x;
26302         long t;
26304 +       preempt_disable_rt();
26305         x = delta + __this_cpu_read(*p);
26307         t = __this_cpu_read(pcp->stat_threshold);
26308 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
26309                 x = 0;
26310         }
26311         __this_cpu_write(*p, x);
26312 +       preempt_enable_rt();
26314  EXPORT_SYMBOL(__mod_node_page_state);
26316 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
26317         s8 __percpu *p = pcp->vm_stat_diff + item;
26318         s8 v, t;
26320 +       preempt_disable_rt();
26321         v = __this_cpu_inc_return(*p);
26322         t = __this_cpu_read(pcp->stat_threshold);
26323         if (unlikely(v > t)) {
26324 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
26325                 zone_page_state_add(v + overstep, zone, item);
26326                 __this_cpu_write(*p, -overstep);
26327         }
26328 +       preempt_enable_rt();
26331  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26332 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26333         s8 __percpu *p = pcp->vm_node_stat_diff + item;
26334         s8 v, t;
26336 +       preempt_disable_rt();
26337         v = __this_cpu_inc_return(*p);
26338         t = __this_cpu_read(pcp->stat_threshold);
26339         if (unlikely(v > t)) {
26340 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26341                 node_page_state_add(v + overstep, pgdat, item);
26342                 __this_cpu_write(*p, -overstep);
26343         }
26344 +       preempt_enable_rt();
26347  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
26348 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
26349         s8 __percpu *p = pcp->vm_stat_diff + item;
26350         s8 v, t;
26352 +       preempt_disable_rt();
26353         v = __this_cpu_dec_return(*p);
26354         t = __this_cpu_read(pcp->stat_threshold);
26355         if (unlikely(v < - t)) {
26356 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
26357                 zone_page_state_add(v - overstep, zone, item);
26358                 __this_cpu_write(*p, overstep);
26359         }
26360 +       preempt_enable_rt();
26363  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26364 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26365         s8 __percpu *p = pcp->vm_node_stat_diff + item;
26366         s8 v, t;
26368 +       preempt_disable_rt();
26369         v = __this_cpu_dec_return(*p);
26370         t = __this_cpu_read(pcp->stat_threshold);
26371         if (unlikely(v < - t)) {
26372 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26373                 node_page_state_add(v - overstep, pgdat, item);
26374                 __this_cpu_write(*p, overstep);
26375         }
26376 +       preempt_enable_rt();
26379  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
26380 diff --git a/mm/workingset.c b/mm/workingset.c
26381 index 4c4f05655e6e..b97b1e87b54c 100644
26382 --- a/mm/workingset.c
26383 +++ b/mm/workingset.c
26384 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
26385   * point where they would still be useful.
26386   */
26388 -struct list_lru workingset_shadow_nodes;
26389 +struct list_lru __workingset_shadow_nodes;
26390 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
26392  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
26393                                         struct shrink_control *sc)
26394 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
26395         unsigned long pages;
26397         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
26398 -       local_irq_disable();
26399 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
26400 -       local_irq_enable();
26401 +       local_lock_irq(workingset_shadow_lock);
26402 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
26403 +       local_unlock_irq(workingset_shadow_lock);
26405         if (sc->memcg) {
26406                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
26407 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
26408         spin_unlock(&mapping->tree_lock);
26409         ret = LRU_REMOVED_RETRY;
26410  out:
26411 -       local_irq_enable();
26412 +       local_unlock_irq(workingset_shadow_lock);
26413         cond_resched();
26414 -       local_irq_disable();
26415 +       local_lock_irq(workingset_shadow_lock);
26416         spin_lock(lru_lock);
26417         return ret;
26419 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
26420         unsigned long ret;
26422         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
26423 -       local_irq_disable();
26424 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
26425 +       local_lock_irq(workingset_shadow_lock);
26426 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
26427                                     shadow_lru_isolate, NULL);
26428 -       local_irq_enable();
26429 +       local_unlock_irq(workingset_shadow_lock);
26430         return ret;
26433 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
26434         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
26435                timestamp_bits, max_order, bucket_order);
26437 -       ret = __list_lru_init(&workingset_shadow_nodes, true, &shadow_nodes_key);
26438 +       ret = __list_lru_init(&__workingset_shadow_nodes, true, &shadow_nodes_key);
26439         if (ret)
26440                 goto err;
26441         ret = register_shrinker(&workingset_shadow_shrinker);
26442 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
26443                 goto err_list_lru;
26444         return 0;
26445  err_list_lru:
26446 -       list_lru_destroy(&workingset_shadow_nodes);
26447 +       list_lru_destroy(&__workingset_shadow_nodes);
26448  err:
26449         return ret;
26451 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
26452 index d3548c48369f..8894f0749d8d 100644
26453 --- a/mm/zsmalloc.c
26454 +++ b/mm/zsmalloc.c
26455 @@ -53,6 +53,7 @@
26456  #include <linux/mount.h>
26457  #include <linux/migrate.h>
26458  #include <linux/pagemap.h>
26459 +#include <linux/locallock.h>
26461  #define ZSPAGE_MAGIC   0x58
26463 @@ -70,9 +71,22 @@
26464   */
26465  #define ZS_MAX_ZSPAGE_ORDER 2
26466  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
26468  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
26470 +#ifdef CONFIG_PREEMPT_RT_FULL
26472 +struct zsmalloc_handle {
26473 +       unsigned long addr;
26474 +       struct mutex lock;
26477 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
26479 +#else
26481 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
26482 +#endif
26484  /*
26485   * Object location (<PFN>, <obj_idx>) is encoded as
26486   * as single (unsigned long) handle value.
26487 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
26489  static int create_cache(struct zs_pool *pool)
26491 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
26492 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
26493                                         0, 0, NULL);
26494         if (!pool->handle_cachep)
26495                 return 1;
26496 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
26498  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
26500 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
26501 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
26502 +       void *p;
26504 +       p = kmem_cache_alloc(pool->handle_cachep,
26505 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
26506 +#ifdef CONFIG_PREEMPT_RT_FULL
26507 +       if (p) {
26508 +               struct zsmalloc_handle *zh = p;
26510 +               mutex_init(&zh->lock);
26511 +       }
26512 +#endif
26513 +       return (unsigned long)p;
26516 +#ifdef CONFIG_PREEMPT_RT_FULL
26517 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
26519 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
26521 +#endif
26523  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
26525         kmem_cache_free(pool->handle_cachep, (void *)handle);
26526 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
26528  static void record_obj(unsigned long handle, unsigned long obj)
26530 +#ifdef CONFIG_PREEMPT_RT_FULL
26531 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26533 +       WRITE_ONCE(zh->addr, obj);
26534 +#else
26535         /*
26536          * lsb of @obj represents handle lock while other bits
26537          * represent object value the handle is pointing so
26538          * updating shouldn't do store tearing.
26539          */
26540         WRITE_ONCE(*(unsigned long *)handle, obj);
26541 +#endif
26544  /* zpool driver */
26545 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
26547  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
26548  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
26549 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
26551  static bool is_zspage_isolated(struct zspage *zspage)
26553 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
26555  static unsigned long handle_to_obj(unsigned long handle)
26557 +#ifdef CONFIG_PREEMPT_RT_FULL
26558 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26560 +       return zh->addr;
26561 +#else
26562         return *(unsigned long *)handle;
26563 +#endif
26566  static unsigned long obj_to_head(struct page *page, void *obj)
26567 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
26569  static inline int testpin_tag(unsigned long handle)
26571 +#ifdef CONFIG_PREEMPT_RT_FULL
26572 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26574 +       return mutex_is_locked(&zh->lock);
26575 +#else
26576         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
26577 +#endif
26580  static inline int trypin_tag(unsigned long handle)
26582 +#ifdef CONFIG_PREEMPT_RT_FULL
26583 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26585 +       return mutex_trylock(&zh->lock);
26586 +#else
26587         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
26588 +#endif
26591  static void pin_tag(unsigned long handle)
26593 +#ifdef CONFIG_PREEMPT_RT_FULL
26594 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26596 +       return mutex_lock(&zh->lock);
26597 +#else
26598         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
26599 +#endif
26602  static void unpin_tag(unsigned long handle)
26604 +#ifdef CONFIG_PREEMPT_RT_FULL
26605 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26607 +       return mutex_unlock(&zh->lock);
26608 +#else
26609         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
26610 +#endif
26613  static void reset_page(struct page *page)
26614 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
26615         class = pool->size_class[class_idx];
26616         off = (class->size * obj_idx) & ~PAGE_MASK;
26618 -       area = &get_cpu_var(zs_map_area);
26619 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
26620         area->vm_mm = mm;
26621         if (off + class->size <= PAGE_SIZE) {
26622                 /* this object is contained entirely within a page */
26623 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
26625                 __zs_unmap_object(area, pages, off, class->size);
26626         }
26627 -       put_cpu_var(zs_map_area);
26628 +       put_locked_var(zs_map_area_lock, zs_map_area);
26630         migrate_read_unlock(zspage);
26631         unpin_tag(handle);
26632 diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
26633 index c88a6007e643..5de85b55a821 100644
26634 --- a/net/bluetooth/hci_sock.c
26635 +++ b/net/bluetooth/hci_sock.c
26636 @@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
26639  /* Send frame to sockets with specific channel */
26640 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26641 -                        int flag, struct sock *skip_sk)
26642 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26643 +                                 int flag, struct sock *skip_sk)
26645         struct sock *sk;
26647         BT_DBG("channel %u len %d", channel, skb->len);
26649 -       read_lock(&hci_sk_list.lock);
26651         sk_for_each(sk, &hci_sk_list.head) {
26652                 struct sk_buff *nskb;
26654 @@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26655                         kfree_skb(nskb);
26656         }
26660 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26661 +                        int flag, struct sock *skip_sk)
26663 +       read_lock(&hci_sk_list.lock);
26664 +       __hci_send_to_channel(channel, skb, flag, skip_sk);
26665         read_unlock(&hci_sk_list.lock);
26668 @@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
26669                 hdr->index = index;
26670                 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
26672 -               hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
26673 -                                   HCI_SOCK_TRUSTED, NULL);
26674 +               __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
26675 +                                     HCI_SOCK_TRUSTED, NULL);
26676                 kfree_skb(skb);
26677         }
26679 diff --git a/net/core/dev.c b/net/core/dev.c
26680 index 09007a71c8dd..6cb279747408 100644
26681 --- a/net/core/dev.c
26682 +++ b/net/core/dev.c
26683 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
26684  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
26686  static seqcount_t devnet_rename_seq;
26687 +static DEFINE_MUTEX(devnet_rename_mutex);
26689  static inline void dev_base_seq_inc(struct net *net)
26691 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
26692  static inline void rps_lock(struct softnet_data *sd)
26694  #ifdef CONFIG_RPS
26695 -       spin_lock(&sd->input_pkt_queue.lock);
26696 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
26697  #endif
26700  static inline void rps_unlock(struct softnet_data *sd)
26702  #ifdef CONFIG_RPS
26703 -       spin_unlock(&sd->input_pkt_queue.lock);
26704 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
26705  #endif
26708 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
26709         strcpy(name, dev->name);
26710         rcu_read_unlock();
26711         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
26712 -               cond_resched();
26713 +               mutex_lock(&devnet_rename_mutex);
26714 +               mutex_unlock(&devnet_rename_mutex);
26715                 goto retry;
26716         }
26718 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
26719         if (dev->flags & IFF_UP)
26720                 return -EBUSY;
26722 -       write_seqcount_begin(&devnet_rename_seq);
26723 +       mutex_lock(&devnet_rename_mutex);
26724 +       __raw_write_seqcount_begin(&devnet_rename_seq);
26726 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
26727 -               write_seqcount_end(&devnet_rename_seq);
26728 -               return 0;
26729 -       }
26730 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
26731 +               goto outunlock;
26733         memcpy(oldname, dev->name, IFNAMSIZ);
26735         err = dev_get_valid_name(net, dev, newname);
26736 -       if (err < 0) {
26737 -               write_seqcount_end(&devnet_rename_seq);
26738 -               return err;
26739 -       }
26740 +       if (err < 0)
26741 +               goto outunlock;
26743         if (oldname[0] && !strchr(oldname, '%'))
26744                 netdev_info(dev, "renamed from %s\n", oldname);
26745 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
26746         if (ret) {
26747                 memcpy(dev->name, oldname, IFNAMSIZ);
26748                 dev->name_assign_type = old_assign_type;
26749 -               write_seqcount_end(&devnet_rename_seq);
26750 -               return ret;
26751 +               err = ret;
26752 +               goto outunlock;
26753         }
26755 -       write_seqcount_end(&devnet_rename_seq);
26756 +       __raw_write_seqcount_end(&devnet_rename_seq);
26757 +       mutex_unlock(&devnet_rename_mutex);
26759         netdev_adjacent_rename_links(dev, oldname);
26761 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
26762                 /* err >= 0 after dev_alloc_name() or stores the first errno */
26763                 if (err >= 0) {
26764                         err = ret;
26765 -                       write_seqcount_begin(&devnet_rename_seq);
26766 +                       mutex_lock(&devnet_rename_mutex);
26767 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
26768                         memcpy(dev->name, oldname, IFNAMSIZ);
26769                         memcpy(oldname, newname, IFNAMSIZ);
26770                         dev->name_assign_type = old_assign_type;
26771 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
26772         }
26774         return err;
26776 +outunlock:
26777 +       __raw_write_seqcount_end(&devnet_rename_seq);
26778 +       mutex_unlock(&devnet_rename_mutex);
26779 +       return err;
26782  /**
26783 @@ -2287,6 +2293,7 @@ static void __netif_reschedule(struct Qdisc *q)
26784         sd->output_queue_tailp = &q->next_sched;
26785         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26786         local_irq_restore(flags);
26787 +       preempt_check_resched_rt();
26790  void __netif_schedule(struct Qdisc *q)
26791 @@ -2371,6 +2378,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
26792         __this_cpu_write(softnet_data.completion_queue, skb);
26793         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26794         local_irq_restore(flags);
26795 +       preempt_check_resched_rt();
26797  EXPORT_SYMBOL(__dev_kfree_skb_irq);
26799 @@ -3112,7 +3120,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
26800          * This permits qdisc->running owner to get the lock more
26801          * often and dequeue packets faster.
26802          */
26803 +#ifdef CONFIG_PREEMPT_RT_FULL
26804 +       contended = true;
26805 +#else
26806         contended = qdisc_is_running(q);
26807 +#endif
26808         if (unlikely(contended))
26809                 spin_lock(&q->busylock);
26811 @@ -3175,8 +3187,10 @@ static void skb_update_prio(struct sk_buff *skb)
26812  #define skb_update_prio(skb)
26813  #endif
26815 +#ifndef CONFIG_PREEMPT_RT_FULL
26816  DEFINE_PER_CPU(int, xmit_recursion);
26817  EXPORT_SYMBOL(xmit_recursion);
26818 +#endif
26820  /**
26821   *     dev_loopback_xmit - loop back @skb
26822 @@ -3410,8 +3424,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26823                 int cpu = smp_processor_id(); /* ok because BHs are off */
26825                 if (txq->xmit_lock_owner != cpu) {
26826 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
26827 -                                    XMIT_RECURSION_LIMIT))
26828 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
26829                                 goto recursion_alert;
26831                         skb = validate_xmit_skb(skb, dev);
26832 @@ -3421,9 +3434,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26833                         HARD_TX_LOCK(dev, txq, cpu);
26835                         if (!netif_xmit_stopped(txq)) {
26836 -                               __this_cpu_inc(xmit_recursion);
26837 +                               xmit_rec_inc();
26838                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
26839 -                               __this_cpu_dec(xmit_recursion);
26840 +                               xmit_rec_dec();
26841                                 if (dev_xmit_complete(rc)) {
26842                                         HARD_TX_UNLOCK(dev, txq);
26843                                         goto out;
26844 @@ -3797,6 +3810,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
26845         rps_unlock(sd);
26847         local_irq_restore(flags);
26848 +       preempt_check_resched_rt();
26850         atomic_long_inc(&skb->dev->rx_dropped);
26851         kfree_skb(skb);
26852 @@ -3815,7 +3829,7 @@ static int netif_rx_internal(struct sk_buff *skb)
26853                 struct rps_dev_flow voidflow, *rflow = &voidflow;
26854                 int cpu;
26856 -               preempt_disable();
26857 +               migrate_disable();
26858                 rcu_read_lock();
26860                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
26861 @@ -3825,13 +3839,13 @@ static int netif_rx_internal(struct sk_buff *skb)
26862                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
26864                 rcu_read_unlock();
26865 -               preempt_enable();
26866 +               migrate_enable();
26867         } else
26868  #endif
26869         {
26870                 unsigned int qtail;
26871 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
26872 -               put_cpu();
26873 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
26874 +               put_cpu_light();
26875         }
26876         return ret;
26878 @@ -3865,11 +3879,9 @@ int netif_rx_ni(struct sk_buff *skb)
26880         trace_netif_rx_ni_entry(skb);
26882 -       preempt_disable();
26883 +       local_bh_disable();
26884         err = netif_rx_internal(skb);
26885 -       if (local_softirq_pending())
26886 -               do_softirq();
26887 -       preempt_enable();
26888 +       local_bh_enable();
26890         return err;
26892 @@ -4348,7 +4360,7 @@ static void flush_backlog(struct work_struct *work)
26893         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
26894                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26895                         __skb_unlink(skb, &sd->input_pkt_queue);
26896 -                       kfree_skb(skb);
26897 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26898                         input_queue_head_incr(sd);
26899                 }
26900         }
26901 @@ -4358,11 +4370,14 @@ static void flush_backlog(struct work_struct *work)
26902         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
26903                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26904                         __skb_unlink(skb, &sd->process_queue);
26905 -                       kfree_skb(skb);
26906 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26907                         input_queue_head_incr(sd);
26908                 }
26909         }
26910 +       if (!skb_queue_empty(&sd->tofree_queue))
26911 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
26912         local_bh_enable();
26916  static void flush_all_backlogs(void)
26917 @@ -4853,6 +4868,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26918                 sd->rps_ipi_list = NULL;
26920                 local_irq_enable();
26921 +               preempt_check_resched_rt();
26923                 /* Send pending IPI's to kick RPS processing on remote cpus. */
26924                 while (remsd) {
26925 @@ -4866,6 +4882,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26926         } else
26927  #endif
26928                 local_irq_enable();
26929 +       preempt_check_resched_rt();
26932  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
26933 @@ -4895,7 +4912,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26934         while (again) {
26935                 struct sk_buff *skb;
26937 +               local_irq_disable();
26938                 while ((skb = __skb_dequeue(&sd->process_queue))) {
26939 +                       local_irq_enable();
26940                         rcu_read_lock();
26941                         __netif_receive_skb(skb);
26942                         rcu_read_unlock();
26943 @@ -4903,9 +4922,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26944                         if (++work >= quota)
26945                                 return work;
26947 +                       local_irq_disable();
26948                 }
26950 -               local_irq_disable();
26951                 rps_lock(sd);
26952                 if (skb_queue_empty(&sd->input_pkt_queue)) {
26953                         /*
26954 @@ -4943,9 +4962,11 @@ void __napi_schedule(struct napi_struct *n)
26955         local_irq_save(flags);
26956         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26957         local_irq_restore(flags);
26958 +       preempt_check_resched_rt();
26960  EXPORT_SYMBOL(__napi_schedule);
26962 +#ifndef CONFIG_PREEMPT_RT_FULL
26963  /**
26964   * __napi_schedule_irqoff - schedule for receive
26965   * @n: entry to schedule
26966 @@ -4957,6 +4978,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
26967         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26969  EXPORT_SYMBOL(__napi_schedule_irqoff);
26970 +#endif
26972  void __napi_complete(struct napi_struct *n)
26974 @@ -5246,13 +5268,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
26975         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
26976         unsigned long time_limit = jiffies + 2;
26977         int budget = netdev_budget;
26978 +       struct sk_buff_head tofree_q;
26979 +       struct sk_buff *skb;
26980         LIST_HEAD(list);
26981         LIST_HEAD(repoll);
26983 +       __skb_queue_head_init(&tofree_q);
26985         local_irq_disable();
26986 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
26987         list_splice_init(&sd->poll_list, &list);
26988         local_irq_enable();
26990 +       while ((skb = __skb_dequeue(&tofree_q)))
26991 +               kfree_skb(skb);
26993         for (;;) {
26994                 struct napi_struct *n;
26996 @@ -5283,7 +5313,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
26997         list_splice_tail(&repoll, &list);
26998         list_splice(&list, &sd->poll_list);
26999         if (!list_empty(&sd->poll_list))
27000 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
27001 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
27003         net_rps_action_and_irq_enable(sd);
27005 @@ -8045,16 +8075,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
27007         raise_softirq_irqoff(NET_TX_SOFTIRQ);
27008         local_irq_enable();
27009 +       preempt_check_resched_rt();
27011         /* Process offline CPU's input_pkt_queue */
27012         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
27013                 netif_rx_ni(skb);
27014                 input_queue_head_incr(oldsd);
27015         }
27016 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
27017 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
27018                 netif_rx_ni(skb);
27019                 input_queue_head_incr(oldsd);
27020         }
27021 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
27022 +               kfree_skb(skb);
27023 +       }
27025         return NOTIFY_OK;
27027 @@ -8359,8 +8393,9 @@ static int __init net_dev_init(void)
27029                 INIT_WORK(flush, flush_backlog);
27031 -               skb_queue_head_init(&sd->input_pkt_queue);
27032 -               skb_queue_head_init(&sd->process_queue);
27033 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
27034 +               skb_queue_head_init_raw(&sd->process_queue);
27035 +               skb_queue_head_init_raw(&sd->tofree_queue);
27036                 INIT_LIST_HEAD(&sd->poll_list);
27037                 sd->output_queue_tailp = &sd->output_queue;
27038  #ifdef CONFIG_RPS
27039 diff --git a/net/core/filter.c b/net/core/filter.c
27040 index 4eb4ce0aeef4..4f09d6a57217 100644
27041 --- a/net/core/filter.c
27042 +++ b/net/core/filter.c
27043 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
27045         int ret;
27047 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
27048 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
27049                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
27050                 kfree_skb(skb);
27051                 return -ENETDOWN;
27052 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
27054         skb->dev = dev;
27056 -       __this_cpu_inc(xmit_recursion);
27057 +       xmit_rec_inc();
27058         ret = dev_queue_xmit(skb);
27059 -       __this_cpu_dec(xmit_recursion);
27060 +       xmit_rec_dec();
27062         return ret;
27064 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
27065 index cad8e791f28e..2a9364fe62a5 100644
27066 --- a/net/core/gen_estimator.c
27067 +++ b/net/core/gen_estimator.c
27068 @@ -84,7 +84,7 @@ struct gen_estimator
27069         struct gnet_stats_basic_packed  *bstats;
27070         struct gnet_stats_rate_est64    *rate_est;
27071         spinlock_t              *stats_lock;
27072 -       seqcount_t              *running;
27073 +       net_seqlock_t           *running;
27074         int                     ewma_log;
27075         u32                     last_packets;
27076         unsigned long           avpps;
27077 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
27078                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
27079                       struct gnet_stats_rate_est64 *rate_est,
27080                       spinlock_t *stats_lock,
27081 -                     seqcount_t *running,
27082 +                     net_seqlock_t *running,
27083                       struct nlattr *opt)
27085         struct gen_estimator *est;
27086 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
27087                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
27088                           struct gnet_stats_rate_est64 *rate_est,
27089                           spinlock_t *stats_lock,
27090 -                         seqcount_t *running, struct nlattr *opt)
27091 +                         net_seqlock_t *running, struct nlattr *opt)
27093         gen_kill_estimator(bstats, rate_est);
27094         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
27095 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
27096 index 508e051304fb..bc3b17b78c94 100644
27097 --- a/net/core/gen_stats.c
27098 +++ b/net/core/gen_stats.c
27099 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
27102  void
27103 -__gnet_stats_copy_basic(const seqcount_t *running,
27104 +__gnet_stats_copy_basic(net_seqlock_t *running,
27105                         struct gnet_stats_basic_packed *bstats,
27106                         struct gnet_stats_basic_cpu __percpu *cpu,
27107                         struct gnet_stats_basic_packed *b)
27108 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
27109         }
27110         do {
27111                 if (running)
27112 -                       seq = read_seqcount_begin(running);
27113 +                       seq = net_seq_begin(running);
27114                 bstats->bytes = b->bytes;
27115                 bstats->packets = b->packets;
27116 -       } while (running && read_seqcount_retry(running, seq));
27117 +       } while (running && net_seq_retry(running, seq));
27119  EXPORT_SYMBOL(__gnet_stats_copy_basic);
27121 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
27122   * if the room in the socket buffer was not sufficient.
27123   */
27124  int
27125 -gnet_stats_copy_basic(const seqcount_t *running,
27126 +gnet_stats_copy_basic(net_seqlock_t *running,
27127                       struct gnet_dump *d,
27128                       struct gnet_stats_basic_cpu __percpu *cpu,
27129                       struct gnet_stats_basic_packed *b)
27130 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
27131 index a64515583bc1..fec448d29f42 100644
27132 --- a/net/core/skbuff.c
27133 +++ b/net/core/skbuff.c
27134 @@ -64,6 +64,7 @@
27135  #include <linux/errqueue.h>
27136  #include <linux/prefetch.h>
27137  #include <linux/if_vlan.h>
27138 +#include <linux/locallock.h>
27140  #include <net/protocol.h>
27141  #include <net/dst.h>
27142 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
27144  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
27145  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
27146 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
27147 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
27149  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27151 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27152         unsigned long flags;
27153         void *data;
27155 -       local_irq_save(flags);
27156 +       local_lock_irqsave(netdev_alloc_lock, flags);
27157         nc = this_cpu_ptr(&netdev_alloc_cache);
27158         data = __alloc_page_frag(nc, fragsz, gfp_mask);
27159 -       local_irq_restore(flags);
27160 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
27161         return data;
27164 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
27166  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27168 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27169 +       struct napi_alloc_cache *nc;
27170 +       void *data;
27172 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
27173 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27174 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
27175 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27176 +       return data;
27179  void *napi_alloc_frag(unsigned int fragsz)
27180 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
27181         if (sk_memalloc_socks())
27182                 gfp_mask |= __GFP_MEMALLOC;
27184 -       local_irq_save(flags);
27185 +       local_lock_irqsave(netdev_alloc_lock, flags);
27187         nc = this_cpu_ptr(&netdev_alloc_cache);
27188         data = __alloc_page_frag(nc, len, gfp_mask);
27189         pfmemalloc = nc->pfmemalloc;
27191 -       local_irq_restore(flags);
27192 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
27194         if (unlikely(!data))
27195                 return NULL;
27196 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
27197  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27198                                  gfp_t gfp_mask)
27200 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27201 +       struct napi_alloc_cache *nc;
27202         struct sk_buff *skb;
27203         void *data;
27204 +       bool pfmemalloc;
27206         len += NET_SKB_PAD + NET_IP_ALIGN;
27208 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27209         if (sk_memalloc_socks())
27210                 gfp_mask |= __GFP_MEMALLOC;
27212 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27213         data = __alloc_page_frag(&nc->page, len, gfp_mask);
27214 +       pfmemalloc = nc->page.pfmemalloc;
27215 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27216         if (unlikely(!data))
27217                 return NULL;
27219 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27220         }
27222         /* use OR instead of assignment to avoid clearing of bits in mask */
27223 -       if (nc->page.pfmemalloc)
27224 +       if (pfmemalloc)
27225                 skb->pfmemalloc = 1;
27226         skb->head_frag = 1;
27228 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
27230  void __kfree_skb_flush(void)
27232 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27233 +       struct napi_alloc_cache *nc;
27235 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27236         /* flush skb_cache if containing objects */
27237         if (nc->skb_count) {
27238                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
27239                                      nc->skb_cache);
27240                 nc->skb_count = 0;
27241         }
27242 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27245  static inline void _kfree_skb_defer(struct sk_buff *skb)
27247 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27248 +       struct napi_alloc_cache *nc;
27250         /* drop skb->head and call any destructors for packet */
27251         skb_release_all(skb);
27253 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27254         /* record skb to CPU local list */
27255         nc->skb_cache[nc->skb_count++] = skb;
27257 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
27258                                      nc->skb_cache);
27259                 nc->skb_count = 0;
27260         }
27261 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27263  void __kfree_skb_defer(struct sk_buff *skb)
27265 diff --git a/net/core/sock.c b/net/core/sock.c
27266 index e3b60460dc9c..8d15848c3a22 100644
27267 --- a/net/core/sock.c
27268 +++ b/net/core/sock.c
27269 @@ -2493,12 +2493,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
27270         if (sk->sk_lock.owned)
27271                 __lock_sock(sk);
27272         sk->sk_lock.owned = 1;
27273 -       spin_unlock(&sk->sk_lock.slock);
27274 +       spin_unlock_bh(&sk->sk_lock.slock);
27275         /*
27276          * The sk_lock has mutex_lock() semantics here:
27277          */
27278         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
27279 -       local_bh_enable();
27281  EXPORT_SYMBOL(lock_sock_nested);
27283 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
27284 index 31f17f0bbd1c..c9525356823c 100644
27285 --- a/net/ipv4/icmp.c
27286 +++ b/net/ipv4/icmp.c
27287 @@ -69,6 +69,7 @@
27288  #include <linux/jiffies.h>
27289  #include <linux/kernel.h>
27290  #include <linux/fcntl.h>
27291 +#include <linux/sysrq.h>
27292  #include <linux/socket.h>
27293  #include <linux/in.h>
27294  #include <linux/inet.h>
27295 @@ -77,6 +78,7 @@
27296  #include <linux/string.h>
27297  #include <linux/netfilter_ipv4.h>
27298  #include <linux/slab.h>
27299 +#include <linux/locallock.h>
27300  #include <net/snmp.h>
27301  #include <net/ip.h>
27302  #include <net/route.h>
27303 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
27304   *
27305   *     On SMP we have one ICMP socket per-cpu.
27306   */
27307 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
27309  static struct sock *icmp_sk(struct net *net)
27311         return *this_cpu_ptr(net->ipv4.icmp_sk);
27312 @@ -215,12 +219,18 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
27314         local_bh_disable();
27316 +       if (!local_trylock(icmp_sk_lock)) {
27317 +               local_bh_enable();
27318 +               return NULL;
27319 +       }
27321         sk = icmp_sk(net);
27323         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
27324                 /* This can happen if the output path signals a
27325                  * dst_link_failure() for an outgoing ICMP packet.
27326                  */
27327 +               local_unlock(icmp_sk_lock);
27328                 local_bh_enable();
27329                 return NULL;
27330         }
27331 @@ -230,6 +240,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
27332  static inline void icmp_xmit_unlock(struct sock *sk)
27334         spin_unlock_bh(&sk->sk_lock.slock);
27335 +       local_unlock(icmp_sk_lock);
27338  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
27339 @@ -358,6 +369,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
27340         struct sock *sk;
27341         struct sk_buff *skb;
27343 +       local_lock(icmp_sk_lock);
27344         sk = icmp_sk(dev_net((*rt)->dst.dev));
27345         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
27346                            icmp_param->data_len+icmp_param->head_len,
27347 @@ -380,6 +392,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
27348                 skb->ip_summed = CHECKSUM_NONE;
27349                 ip_push_pending_frames(sk, fl4);
27350         }
27351 +       local_unlock(icmp_sk_lock);
27354  /*
27355 @@ -899,6 +912,30 @@ static bool icmp_redirect(struct sk_buff *skb)
27356         return true;
27360 + * 32bit and 64bit have different timestamp length, so we check for
27361 + * the cookie at offset 20 and verify it is repeated at offset 50
27362 + */
27363 +#define CO_POS0                20
27364 +#define CO_POS1                50
27365 +#define CO_SIZE                sizeof(int)
27366 +#define ICMP_SYSRQ_SIZE        57
27369 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
27370 + * pattern and if it matches send the next byte as a trigger to sysrq.
27371 + */
27372 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
27374 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
27375 +       char *p = skb->data;
27377 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
27378 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
27379 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
27380 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
27383  /*
27384   *     Handle ICMP_ECHO ("ping") requests.
27385   *
27386 @@ -926,6 +963,11 @@ static bool icmp_echo(struct sk_buff *skb)
27387                 icmp_param.data_len        = skb->len;
27388                 icmp_param.head_len        = sizeof(struct icmphdr);
27389                 icmp_reply(&icmp_param, skb);
27391 +               if (skb->len == ICMP_SYSRQ_SIZE &&
27392 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
27393 +                       icmp_check_sysrq(net, skb);
27394 +               }
27395         }
27396         /* should there be an ICMP stat for ignored echos? */
27397         return true;
27398 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
27399 index 566cfc50f7cf..4b8551d78a3b 100644
27400 --- a/net/ipv4/sysctl_net_ipv4.c
27401 +++ b/net/ipv4/sysctl_net_ipv4.c
27402 @@ -680,6 +680,13 @@ static struct ctl_table ipv4_net_table[] = {
27403                 .mode           = 0644,
27404                 .proc_handler   = proc_dointvec
27405         },
27406 +       {
27407 +               .procname       = "icmp_echo_sysrq",
27408 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
27409 +               .maxlen         = sizeof(int),
27410 +               .mode           = 0644,
27411 +               .proc_handler   = proc_dointvec
27412 +       },
27413         {
27414                 .procname       = "icmp_ignore_bogus_error_responses",
27415                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
27416 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
27417 index b3960738464e..17699390a324 100644
27418 --- a/net/ipv4/tcp_ipv4.c
27419 +++ b/net/ipv4/tcp_ipv4.c
27420 @@ -62,6 +62,7 @@
27421  #include <linux/init.h>
27422  #include <linux/times.h>
27423  #include <linux/slab.h>
27424 +#include <linux/locallock.h>
27426  #include <net/net_namespace.h>
27427  #include <net/icmp.h>
27428 @@ -568,6 +569,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
27430  EXPORT_SYMBOL(tcp_v4_send_check);
27432 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
27433  /*
27434   *     This routine will send an RST to the other tcp.
27435   *
27436 @@ -695,7 +697,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
27437                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
27439         arg.tos = ip_hdr(skb)->tos;
27441         local_bh_disable();
27442 +       local_lock(tcp_sk_lock);
27443         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
27444                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
27445                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
27446 @@ -703,6 +707,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
27448         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
27449         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
27450 +       local_unlock(tcp_sk_lock);
27451         local_bh_enable();
27453  #ifdef CONFIG_TCP_MD5SIG
27454 @@ -780,12 +785,14 @@ static void tcp_v4_send_ack(struct net *net,
27455                 arg.bound_dev_if = oif;
27456         arg.tos = tos;
27457         local_bh_disable();
27458 +       local_lock(tcp_sk_lock);
27459         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
27460                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
27461                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
27462                               &arg, arg.iov[0].iov_len);
27464         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
27465 +       local_unlock(tcp_sk_lock);
27466         local_bh_enable();
27469 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
27470 index 439e597fd374..ca0daeaff370 100644
27471 --- a/net/mac80211/rx.c
27472 +++ b/net/mac80211/rx.c
27473 @@ -4229,7 +4229,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
27474         struct ieee80211_supported_band *sband;
27475         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
27477 -       WARN_ON_ONCE(softirq_count() == 0);
27478 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
27480         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
27481                 goto drop;
27482 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
27483 index d869ea50623e..5cafa87b030b 100644
27484 --- a/net/netfilter/core.c
27485 +++ b/net/netfilter/core.c
27486 @@ -22,12 +22,18 @@
27487  #include <linux/proc_fs.h>
27488  #include <linux/mutex.h>
27489  #include <linux/slab.h>
27490 +#include <linux/locallock.h>
27491  #include <linux/rcupdate.h>
27492  #include <net/net_namespace.h>
27493  #include <net/sock.h>
27495  #include "nf_internals.h"
27497 +#ifdef CONFIG_PREEMPT_RT_BASE
27498 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
27499 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
27500 +#endif
27502  static DEFINE_MUTEX(afinfo_mutex);
27504  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
27505 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
27506 index 267db0d603bc..00994de54d57 100644
27507 --- a/net/packet/af_packet.c
27508 +++ b/net/packet/af_packet.c
27509 @@ -63,6 +63,7 @@
27510  #include <linux/if_packet.h>
27511  #include <linux/wireless.h>
27512  #include <linux/kernel.h>
27513 +#include <linux/delay.h>
27514  #include <linux/kmod.h>
27515  #include <linux/slab.h>
27516  #include <linux/vmalloc.h>
27517 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
27518         if (BLOCK_NUM_PKTS(pbd)) {
27519                 while (atomic_read(&pkc->blk_fill_in_prog)) {
27520                         /* Waiting for skb_copy_bits to finish... */
27521 -                       cpu_relax();
27522 +                       cpu_chill();
27523                 }
27524         }
27526 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
27527                 if (!(status & TP_STATUS_BLK_TMO)) {
27528                         while (atomic_read(&pkc->blk_fill_in_prog)) {
27529                                 /* Waiting for skb_copy_bits to finish... */
27530 -                               cpu_relax();
27531 +                               cpu_chill();
27532                         }
27533                 }
27534                 prb_close_block(pkc, pbd, po, status);
27535 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
27536 index 977f69886c00..f3e7a36b0396 100644
27537 --- a/net/rds/ib_rdma.c
27538 +++ b/net/rds/ib_rdma.c
27539 @@ -34,6 +34,7 @@
27540  #include <linux/slab.h>
27541  #include <linux/rculist.h>
27542  #include <linux/llist.h>
27543 +#include <linux/delay.h>
27545  #include "rds_single_path.h"
27546  #include "ib_mr.h"
27547 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
27548         for_each_online_cpu(cpu) {
27549                 flag = &per_cpu(clean_list_grace, cpu);
27550                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
27551 -                       cpu_relax();
27552 +                       cpu_chill();
27553         }
27556 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
27557 index 7d921e56e715..13df56a738e5 100644
27558 --- a/net/rxrpc/security.c
27559 +++ b/net/rxrpc/security.c
27560 @@ -19,9 +19,6 @@
27561  #include <keys/rxrpc-type.h>
27562  #include "ar-internal.h"
27564 -static LIST_HEAD(rxrpc_security_methods);
27565 -static DECLARE_RWSEM(rxrpc_security_sem);
27567  static const struct rxrpc_security *rxrpc_security_types[] = {
27568         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
27569  #ifdef CONFIG_RXKAD
27570 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
27571 index ea13df1be067..76c20745b502 100644
27572 --- a/net/sched/sch_api.c
27573 +++ b/net/sched/sch_api.c
27574 @@ -980,7 +980,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
27575                         rcu_assign_pointer(sch->stab, stab);
27576                 }
27577                 if (tca[TCA_RATE]) {
27578 -                       seqcount_t *running;
27579 +                       net_seqlock_t *running;
27581                         err = -EOPNOTSUPP;
27582                         if (sch->flags & TCQ_F_MQROOT)
27583 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
27584 index 9016c8baf2aa..d925f0e63679 100644
27585 --- a/net/sched/sch_generic.c
27586 +++ b/net/sched/sch_generic.c
27587 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
27588         .ops            =       &noop_qdisc_ops,
27589         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
27590         .dev_queue      =       &noop_netdev_queue,
27591 +#ifdef CONFIG_PREEMPT_RT_BASE
27592 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
27593 +#else
27594         .running        =       SEQCNT_ZERO(noop_qdisc.running),
27595 +#endif
27596         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
27597  };
27598  EXPORT_SYMBOL(noop_qdisc);
27599 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
27600         lockdep_set_class(&sch->busylock,
27601                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
27603 +#ifdef CONFIG_PREEMPT_RT_BASE
27604 +       seqlock_init(&sch->running);
27605 +       lockdep_set_class(&sch->running.seqcount,
27606 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27607 +       lockdep_set_class(&sch->running.lock,
27608 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27609 +#else
27610         seqcount_init(&sch->running);
27611         lockdep_set_class(&sch->running,
27612                           dev->qdisc_running_key ?: &qdisc_running_key);
27613 +#endif
27615         sch->ops = ops;
27616         sch->enqueue = ops->enqueue;
27617 @@ -926,7 +938,7 @@ void dev_deactivate_many(struct list_head *head)
27618         /* Wait for outstanding qdisc_run calls. */
27619         list_for_each_entry(dev, head, close_list)
27620                 while (some_qdisc_is_busy(dev))
27621 -                       yield();
27622 +                       msleep(1);
27625  void dev_deactivate(struct net_device *dev)
27626 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
27627 index 9c9db55a0c1e..e6583b018a72 100644
27628 --- a/net/sunrpc/svc_xprt.c
27629 +++ b/net/sunrpc/svc_xprt.c
27630 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27631                 goto out;
27632         }
27634 -       cpu = get_cpu();
27635 +       cpu = get_cpu_light();
27636         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
27638         atomic_long_inc(&pool->sp_stats.packets);
27639 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27641                 atomic_long_inc(&pool->sp_stats.threads_woken);
27642                 wake_up_process(rqstp->rq_task);
27643 -               put_cpu();
27644 +               put_cpu_light();
27645                 goto out;
27646         }
27647         rcu_read_unlock();
27648 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27649                 goto redo_search;
27650         }
27651         rqstp = NULL;
27652 -       put_cpu();
27653 +       put_cpu_light();
27654  out:
27655         trace_svc_xprt_do_enqueue(xprt, rqstp);
27657 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
27658 index 6fdc97ef6023..523e0420d7f0 100755
27659 --- a/scripts/mkcompile_h
27660 +++ b/scripts/mkcompile_h
27661 @@ -4,7 +4,8 @@ TARGET=$1
27662  ARCH=$2
27663  SMP=$3
27664  PREEMPT=$4
27665 -CC=$5
27666 +RT=$5
27667 +CC=$6
27669  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
27671 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
27672  CONFIG_FLAGS=""
27673  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
27674  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
27675 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
27676  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
27678  # Truncate to maximum length
27679 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
27680 index 9d33c1e85c79..3d307bda86f9 100644
27681 --- a/sound/core/pcm_native.c
27682 +++ b/sound/core/pcm_native.c
27683 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
27684  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
27686         if (!substream->pcm->nonatomic)
27687 -               local_irq_disable();
27688 +               local_irq_disable_nort();
27689         snd_pcm_stream_lock(substream);
27691  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
27692 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
27694         snd_pcm_stream_unlock(substream);
27695         if (!substream->pcm->nonatomic)
27696 -               local_irq_enable();
27697 +               local_irq_enable_nort();
27699  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
27701 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
27703         unsigned long flags = 0;
27704         if (!substream->pcm->nonatomic)
27705 -               local_irq_save(flags);
27706 +               local_irq_save_nort(flags);
27707         snd_pcm_stream_lock(substream);
27708         return flags;
27710 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
27712         snd_pcm_stream_unlock(substream);
27713         if (!substream->pcm->nonatomic)
27714 -               local_irq_restore(flags);
27715 +               local_irq_restore_nort(flags);
27717  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);