linux: bump all lts / stable to latest
1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems,
14 +however there is nothing x86 specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
26 +The hardware latency detector works by hogging all of the CPUs for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted, and since the machine is stopped and
30 +interrupts are turned off, the only thing that could do that would be an SMI.
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
36 +Usage:
37 +------
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or by
40 +toggling on the "enable" entry in the "hwlat_detector" debugfs directory) is
41 +the only step required to start the hwlat_detector. It is possible to redefine the
42 +threshold in microseconds (us) above which latency spikes will be taken
43 +into account (parameter "threshold=").
45 +Example:
47 +       # modprobe hwlat_detector enabled=1 threshold=100
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint, "/debug/hwlat_detector" in this document. It is necessary
51 +to have debugfs mounted, which might be on /sys/debug on your system.
53 +The /debug/hwlat_detector interface contains the following files:
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
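As a usage illustration of the interface described above, here is a minimal userspace sketch that reads one raw sample from the "sample" pipe. It is only a sketch: it assumes debugfs is mounted at /sys/kernel/debug (the text above writes /debug), and that the module is already loaded with a suitable threshold.

/* Illustrative reader for the hwlat_detector "sample" pipe.
 * Assumes debugfs is mounted at /sys/kernel/debug; adjust the path if your
 * system mounts it elsewhere (e.g. /debug as in the text above).
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[128];
        ssize_t n;
        /* O_NONBLOCK returns at most one "<timestamp> <latency usecs>" line */
        int fd = open("/sys/kernel/debug/hwlat_detector/sample",
                      O_RDONLY | O_NONBLOCK);

        if (fd < 0) {
                perror("open sample");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("%s", buf);
        }
        close(fd);
        return 0;
}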
71 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
72 index ca64ca566099..19bef799c902 100644
73 --- a/Documentation/kernel-parameters.txt
74 +++ b/Documentation/kernel-parameters.txt
75 @@ -1640,6 +1640,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
76         ip=             [IP_PNP]
77                         See Documentation/filesystems/nfs/nfsroot.txt.
79 +       irqaffinity=    [SMP] Set the default irq affinity mask
80 +                       Format:
81 +                       <cpu number>,...,<cpu number>
82 +                       or
83 +                       <cpu number>-<cpu number>
84 +                       (must be a positive range in ascending order)
85 +                       or a mixture
86 +                       <cpu number>,...,<cpu number>-<cpu number>
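For illustration (a hypothetical value, not part of the original entry): booting with irqaffinity=0,4-7 would restrict the default IRQ affinity mask to CPU 0 and CPUs 4 through 7, using the mixed list/range form shown above.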
88         irqfixup        [HW]
89                         When an interrupt is not handled search all handlers
90                         for it. Intended to get systems with badly broken
91 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
92 index 13f5619b2203..f64d075ba647 100644
93 --- a/Documentation/sysrq.txt
94 +++ b/Documentation/sysrq.txt
95 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
96  On other - If you know of the key combos for other architectures, please
97             let me know so I can add them to this section.
99 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
101 +On all -  write a character to /proc/sysrq-trigger, e.g.:
102                 echo t > /proc/sysrq-trigger
104 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
105 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
106 +        Send an ICMP echo request with this pattern plus the particular
107 +        SysRq command key. Example:
108 +               # ping -c1 -s57 -p0102030468
109 +        will trigger the SysRq-H (help) command.
112  *  What are the 'command' keys?
113  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
114  'b'     - Will immediately reboot the system without syncing or unmounting
115 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
116 new file mode 100644
117 index 000000000000..6f2aeabf7faa
118 --- /dev/null
119 +++ b/Documentation/trace/histograms.txt
120 @@ -0,0 +1,186 @@
121 +               Using the Linux Kernel Latency Histograms
124 +This document gives a short explanation of how to enable, configure, and use
125 +latency histograms. Latency histograms are primarily relevant in the
126 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
127 +and are used in the quality management of the Linux real-time
128 +capabilities.
131 +* Purpose of latency histograms
133 +A latency histogram continuously accumulates the frequencies of latency
134 +data. There are two types of histograms:
135 +- potential sources of latencies
136 +- effective latencies
139 +* Potential sources of latencies
141 +Potential sources of latencies are code segments where interrupts,
142 +preemption or both are disabled (aka critical sections). To create
143 +histograms of potential sources of latency, the kernel stores the time
144 +stamp at the start of a critical section, determines the time elapsed
145 +when the end of the section is reached, and increments the frequency
146 +counter of that latency value - irrespective of whether any concurrently
147 +running process is affected by latency or not.
148 +- Configuration items (in the Kernel hacking/Tracers submenu)
149 +  CONFIG_INTERRUPT_OFF_LATENCY
150 +  CONFIG_PREEMPT_OFF_LATENCY
153 +* Effective latencies
155 +Effective latencies are those actually occurring during the wakeup of a process. To
156 +determine effective latencies, the kernel stores the time stamp when a
157 +process is scheduled to be woken up, and determines the duration of the
158 +wakeup time shortly before control is passed over to this process. Note
159 +that the apparent latency in user space may be somewhat longer, since the
160 +process may be interrupted after control is passed over to it but before
161 +the execution in user space takes place. Simply measuring the interval
162 +between enqueuing and wakeup may also not be appropriate in cases where a
163 +process is scheduled as a result of a timer expiration. The timer may have
164 +missed its deadline, e.g. due to disabled interrupts, but this latency
165 +would not be registered. Therefore, the offsets of missed timers are
166 +recorded in a separate histogram. If both wakeup latency and missed timer
167 +offsets are configured and enabled, a third histogram may be enabled that
168 +records the overall latency as a sum of the timer latency, if any, and the
169 +wakeup latency. This histogram is called "timerandwakeup".
170 +- Configuration items (in the Kernel hacking/Tracers submenu)
171 +  CONFIG_WAKEUP_LATENCY
172 +  CONFIG_MISSED_TIMER_OFSETS
175 +* Usage
177 +The interface to the administration of the latency histograms is located
178 +in the debugfs file system. To mount it, either enter
180 +mount -t sysfs nodev /sys
181 +mount -t debugfs nodev /sys/kernel/debug
183 +from shell command line level, or add
185 +nodev  /sys                    sysfs   defaults        0 0
186 +nodev  /sys/kernel/debug       debugfs defaults        0 0
188 +to the file /etc/fstab. All latency histogram related files are then
189 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
190 +particular histogram type is enabled by writing non-zero to the related
191 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
192 +Select "preemptirqsoff" for the histograms of potential sources of
193 +latencies and "wakeup" for histograms of effective latencies etc. The
194 +histogram data - one per CPU - are available in the files
196 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
197 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
198 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
199 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
200 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
201 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
202 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
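A minimal sketch of that sequence in C (paths taken from the description above; it assumes debugfs is mounted at /sys/kernel/debug and keeps error handling short):

/* Illustrative only: enable the wakeup latency histogram and dump CPU0 data.
 * Adjust the paths if debugfs is mounted elsewhere on your system.
 */
#include <stdio.h>
#include <stdlib.h>

#define LAT_HIST "/sys/kernel/debug/tracing/latency_hist"

static void write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(1);
        }
        fputs(val, f);
        fclose(f);
}

int main(void)
{
        char line[256];
        FILE *f;

        /* writing non-zero to a file in the "enable" directory switches it on */
        write_str(LAT_HIST "/enable/wakeup", "1");

        /* one data file per CPU; CPU0 shown here */
        f = fopen(LAT_HIST "/wakeup/CPU0", "r");
        if (!f) {
                perror("open CPU0");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}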
204 +The histograms are reset by writing non-zero to the file "reset" in a
205 +particular latency directory. To reset all latency data, use
207 +#!/bin/sh
209 +TRACINGDIR=/sys/kernel/debug/tracing
210 +HISTDIR=$TRACINGDIR/latency_hist
212 +if test -d $HISTDIR
213 +then
214 +  cd $HISTDIR
215 +  for i in `find . | grep /reset$`
216 +  do
217 +    echo 1 >$i
218 +  done
219 +fi
222 +* Data format
224 +Latency data are stored with a resolution of one microsecond. The
225 +maximum latency is 10,240 microseconds. The data are only valid if the
226 +overflow register is empty. Every output line contains the latency in
227 +microseconds in the first column and the number of samples in the second
228 +column. To display only lines with a positive latency count, use, for
229 +example,
231 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
233 +#Minimum latency: 0 microseconds.
234 +#Average latency: 0 microseconds.
235 +#Maximum latency: 25 microseconds.
236 +#Total samples: 3104770694
237 +#There are 0 samples greater or equal than 10240 microseconds
238 +#usecs          samples
239 +    0        2984486876
240 +    1          49843506
241 +    2          58219047
242 +    3           5348126
243 +    4           2187960
244 +    5           3388262
245 +    6            959289
246 +    7            208294
247 +    8             40420
248 +    9              4485
249 +   10             14918
250 +   11             18340
251 +   12             25052
252 +   13             19455
253 +   14              5602
254 +   15               969
255 +   16                47
256 +   17                18
257 +   18                14
258 +   19                 1
259 +   20                 3
260 +   21                 2
261 +   22                 5
262 +   23                 2
263 +   25                 1
266 +* Wakeup latency of a selected process
268 +To only collect wakeup latency data of a particular process, write the
269 +PID of the requested process to
271 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
273 +PIDs are not considered if this variable is set to 0.
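As an illustration (a sketch using the same debugfs path as above; the PID value is a placeholder):

/* Illustrative only: restrict wakeup latency collection to one PID. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/debug/tracing/latency_hist/wakeup/pid", "w");

        if (!f) {
                perror("wakeup/pid");
                return 1;
        }
        fprintf(f, "%d\n", 1234);  /* placeholder PID; write 0 to disable filtering */
        fclose(f);
        return 0;
}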
276 +* Details of the process with the highest wakeup latency so far
278 +Selected data of the process that suffered from the highest wakeup
279 +latency that occurred in a particular CPU are available in the file
281 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
283 +In addition, other relevant system data at the time when the
284 +latency occurred are given.
286 +The format of the data is (all in one line):
287 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
288 +<- <PID> <Priority> <Command> <Timestamp>
290 +The value of <Timeroffset> is only relevant in the combined timer
291 +and wakeup latency recording. In the wakeup recording it is
292 +always 0; in the missed_timer_offsets recording it is the same
293 +as <Latency>.
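A sketch of how such a line could be parsed, assuming command names without embedded spaces (a simplification; the sample line below is invented for illustration):

/* Illustrative parser for one max_latency-CPUx line in the format above. */
#include <stdio.h>

int main(void)
{
        const char *line =
                "1234 99 18 (0) cyclictest <- 5678 120 ksoftirqd/0 123.456789";
        int pid, prio, lat, toff, cpid, cprio;
        char comm[32], ccomm[32], tstamp[32];

        if (sscanf(line, "%d %d %d (%d) %31s <- %d %d %31s %31s",
                   &pid, &prio, &lat, &toff, comm,
                   &cpid, &cprio, ccomm, tstamp) == 9)
                printf("pid=%d latency=%dus woken by %s at %s\n",
                       pid, lat, ccomm, tstamp);
        return 0;
}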
295 +When retrospectively searching for the origin of a latency while
296 +tracing was not enabled, it may be helpful to know the name and
297 +some basic data of the task that (finally) switched to the
298 +late real-time task. In addition to the victim's data, the
299 +data of the possible culprit are therefore also displayed after the
300 +"<-" symbol.
302 +Finally, the timestamp of the time when the latency occurred
303 +in <seconds>.<microseconds> after the most recent system boot
304 +is provided.
306 +These data are also reset when the wakeup histogram is reset.
307 diff --git a/Makefile b/Makefile
308 index a5ecb29c6ed3..5521b0c3abf2 100644
309 --- a/Makefile
310 +++ b/Makefile
311 @@ -785,6 +785,9 @@ KBUILD_CFLAGS   += $(call cc-option,-Werror=strict-prototypes)
312  # Prohibit date/time macros, which would make the build non-deterministic
313  KBUILD_CFLAGS   += $(call cc-option,-Werror=date-time)
315 +# enforce correct pointer usage
316 +KBUILD_CFLAGS   += $(call cc-option,-Werror=incompatible-pointer-types)
318  # use the deterministic mode of AR if available
319  KBUILD_ARFLAGS := $(call ar-option,D)
321 diff --git a/arch/Kconfig b/arch/Kconfig
322 index 4e949e58b192..3b26d76933fb 100644
323 --- a/arch/Kconfig
324 +++ b/arch/Kconfig
325 @@ -9,6 +9,7 @@ config OPROFILE
326         tristate "OProfile system profiling"
327         depends on PROFILING
328         depends on HAVE_OPROFILE
329 +       depends on !PREEMPT_RT_FULL
330         select RING_BUFFER
331         select RING_BUFFER_ALLOW_SWAP
332         help
333 @@ -52,6 +53,7 @@ config KPROBES
334  config JUMP_LABEL
335         bool "Optimize very unlikely/likely branches"
336         depends on HAVE_ARCH_JUMP_LABEL
337 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
338         help
339           This option enables a transparent branch optimization that
340          makes certain almost-always-true or almost-always-false branch
341 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
342 index 34e1569a11ee..79c4603e9453 100644
343 --- a/arch/arm/Kconfig
344 +++ b/arch/arm/Kconfig
345 @@ -33,7 +33,7 @@ config ARM
346         select HARDIRQS_SW_RESEND
347         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
348         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
349 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
350 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !PREEMPT_RT_BASE
351         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
352         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
353         select HAVE_ARCH_TRACEHOOK
354 @@ -68,6 +68,7 @@ config ARM
355         select HAVE_PERF_EVENTS
356         select HAVE_PERF_REGS
357         select HAVE_PERF_USER_STACK_DUMP
358 +       select HAVE_PREEMPT_LAZY
359         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
360         select HAVE_REGS_AND_STACK_ACCESS_API
361         select HAVE_SYSCALL_TRACEPOINTS
362 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
363 index 12ebfcc1d539..c962084605bc 100644
364 --- a/arch/arm/include/asm/switch_to.h
365 +++ b/arch/arm/include/asm/switch_to.h
366 @@ -3,6 +3,13 @@
368  #include <linux/thread_info.h>
370 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
371 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
372 +#else
373 +static inline void
374 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
375 +#endif
377  /*
378   * For v7 SMP cores running a preemptible kernel we may be pre-empted
379   * during a TLB maintenance operation, so execute an inner-shareable dsb
380 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
381  #define switch_to(prev,next,last)                                      \
382  do {                                                                   \
383         __complete_pending_tlbi();                                      \
384 +       switch_kmaps(prev, next);                                       \
385         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
386  } while (0)
388 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
389 index 776757d1604a..1f36a4eccc72 100644
390 --- a/arch/arm/include/asm/thread_info.h
391 +++ b/arch/arm/include/asm/thread_info.h
392 @@ -49,6 +49,7 @@ struct cpu_context_save {
393  struct thread_info {
394         unsigned long           flags;          /* low level flags */
395         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
396 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
397         mm_segment_t            addr_limit;     /* address limit */
398         struct task_struct      *task;          /* main task structure */
399         __u32                   cpu;            /* cpu */
400 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
401  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
402  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
403  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
404 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
405 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
406 +#define TIF_NEED_RESCHED_LAZY  7
408  #define TIF_NOHZ               12      /* in adaptive nohz mode */
409  #define TIF_USING_IWMMXT       17
410 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
411  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
412  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
413  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
414 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
415  #define _TIF_UPROBE            (1 << TIF_UPROBE)
416  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
417  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
418 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
419   * Change these and you break ASM code in entry-common.S
420   */
421  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
422 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
423 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
424 +                                _TIF_NEED_RESCHED_LAZY)
426  #endif /* __KERNEL__ */
427  #endif /* __ASM_ARM_THREAD_INFO_H */
428 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
429 index 871b8267d211..4dbe70de7318 100644
430 --- a/arch/arm/kernel/asm-offsets.c
431 +++ b/arch/arm/kernel/asm-offsets.c
432 @@ -65,6 +65,7 @@ int main(void)
433    BLANK();
434    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
435    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
436 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
437    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
438    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
439    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
440 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
441 index 3ce377f7251f..d044cea59f54 100644
442 --- a/arch/arm/kernel/entry-armv.S
443 +++ b/arch/arm/kernel/entry-armv.S
444 @@ -215,11 +215,18 @@ __irq_svc:
445  #ifdef CONFIG_PREEMPT
446         get_thread_info tsk
447         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
448 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
449         teq     r8, #0                          @ if preempt count != 0
450 +       bne     1f                              @ return from exception
451 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
452 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
453 +       blne    svc_preempt                     @ preempt!
455 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
456 +       teq     r8, #0                          @ if preempt lazy count != 0
457         movne   r0, #0                          @ force flags to 0
458 -       tst     r0, #_TIF_NEED_RESCHED
459 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
460         blne    svc_preempt
462  #endif
464         svc_exit r5, irq = 1                    @ return from exception
465 @@ -234,8 +241,14 @@ svc_preempt:
466  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
467         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
468         tst     r0, #_TIF_NEED_RESCHED
469 +       bne     1b
470 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
471         reteq   r8                              @ go again
472 -       b       1b
473 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
474 +       teq     r0, #0                          @ if preempt lazy count != 0
475 +       beq     1b
476 +       ret     r8                              @ go again
478  #endif
480  __und_fault:
481 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
482 index 30a7228eaceb..c3bd6cbfce4b 100644
483 --- a/arch/arm/kernel/entry-common.S
484 +++ b/arch/arm/kernel/entry-common.S
485 @@ -36,7 +36,9 @@ ret_fast_syscall:
486   UNWIND(.cantunwind    )
487         disable_irq_notrace                     @ disable interrupts
488         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
489 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
490 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
491 +       bne     fast_work_pending
492 +       tst     r1, #_TIF_SECCOMP
493         bne     fast_work_pending
495         /* perform architecture specific actions before user return */
496 @@ -62,8 +64,11 @@ ret_fast_syscall:
497         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
498         disable_irq_notrace                     @ disable interrupts
499         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
500 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
501 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
502 +       bne     do_slower_path
503 +       tst     r1, #_TIF_SECCOMP
504         beq     no_work_pending
505 +do_slower_path:
506   UNWIND(.fnend         )
507  ENDPROC(ret_fast_syscall)
509 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
510 index 69bda1a5707e..1f665acaa6a9 100644
511 --- a/arch/arm/kernel/patch.c
512 +++ b/arch/arm/kernel/patch.c
513 @@ -15,7 +15,7 @@ struct patch {
514         unsigned int insn;
515  };
517 -static DEFINE_SPINLOCK(patch_lock);
518 +static DEFINE_RAW_SPINLOCK(patch_lock);
520  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
521         __acquires(&patch_lock)
522 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
523                 return addr;
525         if (flags)
526 -               spin_lock_irqsave(&patch_lock, *flags);
527 +               raw_spin_lock_irqsave(&patch_lock, *flags);
528         else
529                 __acquire(&patch_lock);
531 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
532         clear_fixmap(fixmap);
534         if (flags)
535 -               spin_unlock_irqrestore(&patch_lock, *flags);
536 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
537         else
538                 __release(&patch_lock);
540 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
541 index 4adfb46e3ee9..15f1d94b47c5 100644
542 --- a/arch/arm/kernel/process.c
543 +++ b/arch/arm/kernel/process.c
544 @@ -319,6 +319,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
547  #ifdef CONFIG_MMU
548 +/*
549 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
550 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
551 + * fail.
552 + */
553 +static int __init vectors_user_mapping_init_page(void)
555 +       struct page *page;
556 +       unsigned long addr = 0xffff0000;
557 +       pgd_t *pgd;
558 +       pud_t *pud;
559 +       pmd_t *pmd;
561 +       pgd = pgd_offset_k(addr);
562 +       pud = pud_offset(pgd, addr);
563 +       pmd = pmd_offset(pud, addr);
564 +       page = pmd_page(*(pmd));
566 +       pgtable_page_ctor(page);
568 +       return 0;
570 +late_initcall(vectors_user_mapping_init_page);
572  #ifdef CONFIG_KUSER_HELPERS
573  /*
574   * The vectors page is always readable from user space for the
575 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
576 index 7b8f2141427b..96541e00b74a 100644
577 --- a/arch/arm/kernel/signal.c
578 +++ b/arch/arm/kernel/signal.c
579 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
580          */
581         trace_hardirqs_off();
582         do {
583 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
584 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
585 +                                          _TIF_NEED_RESCHED_LAZY))) {
586                         schedule();
587                 } else {
588                         if (unlikely(!user_mode(regs)))
589 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
590 index b26361355dae..e5754e3b03c4 100644
591 --- a/arch/arm/kernel/smp.c
592 +++ b/arch/arm/kernel/smp.c
593 @@ -230,8 +230,6 @@ int __cpu_disable(void)
594         flush_cache_louis();
595         local_flush_tlb_all();
597 -       clear_tasks_mm_cpumask(cpu);
599         return 0;
602 @@ -247,6 +245,9 @@ void __cpu_die(unsigned int cpu)
603                 pr_err("CPU%u: cpu didn't die\n", cpu);
604                 return;
605         }
607 +       clear_tasks_mm_cpumask(cpu);
609         pr_notice("CPU%u: shutdown\n", cpu);
611         /*
612 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
613 index 0bee233fef9a..314cfb232a63 100644
614 --- a/arch/arm/kernel/unwind.c
615 +++ b/arch/arm/kernel/unwind.c
616 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
617  static const struct unwind_idx *__origin_unwind_idx;
618  extern const struct unwind_idx __stop_unwind_idx[];
620 -static DEFINE_SPINLOCK(unwind_lock);
621 +static DEFINE_RAW_SPINLOCK(unwind_lock);
622  static LIST_HEAD(unwind_tables);
624  /* Convert a prel31 symbol to an absolute address */
625 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
626                 /* module unwind tables */
627                 struct unwind_table *table;
629 -               spin_lock_irqsave(&unwind_lock, flags);
630 +               raw_spin_lock_irqsave(&unwind_lock, flags);
631                 list_for_each_entry(table, &unwind_tables, list) {
632                         if (addr >= table->begin_addr &&
633                             addr < table->end_addr) {
634 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
635                                 break;
636                         }
637                 }
638 -               spin_unlock_irqrestore(&unwind_lock, flags);
639 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
640         }
642         pr_debug("%s: idx = %p\n", __func__, idx);
643 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
644         tab->begin_addr = text_addr;
645         tab->end_addr = text_addr + text_size;
647 -       spin_lock_irqsave(&unwind_lock, flags);
648 +       raw_spin_lock_irqsave(&unwind_lock, flags);
649         list_add_tail(&tab->list, &unwind_tables);
650 -       spin_unlock_irqrestore(&unwind_lock, flags);
651 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
653         return tab;
655 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
656         if (!tab)
657                 return;
659 -       spin_lock_irqsave(&unwind_lock, flags);
660 +       raw_spin_lock_irqsave(&unwind_lock, flags);
661         list_del(&tab->list);
662 -       spin_unlock_irqrestore(&unwind_lock, flags);
663 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
665         kfree(tab);
667 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
668 index d7bef2144760..36a3e51492f7 100644
669 --- a/arch/arm/kvm/arm.c
670 +++ b/arch/arm/kvm/arm.c
671 @@ -496,18 +496,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
672         struct kvm_vcpu *vcpu;
674         kvm_for_each_vcpu(i, vcpu, kvm) {
675 -               wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
676 +               struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
678                 vcpu->arch.pause = false;
679 -               wake_up_interruptible(wq);
680 +               swake_up(wq);
681         }
684  static void vcpu_sleep(struct kvm_vcpu *vcpu)
686 -       wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
687 +       struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
689 -       wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
690 +       swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
691                                        (!vcpu->arch.pause)));
694 @@ -566,7 +566,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
695                  * involves poking the GIC, which must be done in a
696                  * non-preemptible context.
697                  */
698 -               preempt_disable();
699 +               migrate_disable();
700                 kvm_timer_flush_hwstate(vcpu);
701                 kvm_vgic_flush_hwstate(vcpu);
703 @@ -585,7 +585,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
704                         local_irq_enable();
705                         kvm_timer_sync_hwstate(vcpu);
706                         kvm_vgic_sync_hwstate(vcpu);
707 -                       preempt_enable();
708 +                       migrate_enable();
709                         continue;
710                 }
712 @@ -639,7 +639,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
714                 kvm_vgic_sync_hwstate(vcpu);
716 -               preempt_enable();
717 +               migrate_enable();
719                 ret = handle_exit(vcpu, run, ret);
720         }
721 diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
722 index 443db0c43d7c..a08d7a93aebb 100644
723 --- a/arch/arm/kvm/psci.c
724 +++ b/arch/arm/kvm/psci.c
725 @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
727         struct kvm *kvm = source_vcpu->kvm;
728         struct kvm_vcpu *vcpu = NULL;
729 -       wait_queue_head_t *wq;
730 +       struct swait_queue_head *wq;
731         unsigned long cpu_id;
732         unsigned long context_id;
733         phys_addr_t target_pc;
734 @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
735         smp_mb();               /* Make sure the above is visible */
737         wq = kvm_arch_vcpu_wq(vcpu);
738 -       wake_up_interruptible(wq);
739 +       swake_up(wq);
741         return PSCI_RET_SUCCESS;
743 diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
744 index 28656c2b54a0..3f501305ca26 100644
745 --- a/arch/arm/mach-at91/Kconfig
746 +++ b/arch/arm/mach-at91/Kconfig
747 @@ -99,6 +99,7 @@ config HAVE_AT91_USB_CLK
748  config COMMON_CLK_AT91
749         bool
750         select COMMON_CLK
751 +       select MFD_SYSCON
753  config HAVE_AT91_SMD
754         bool
755 diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c
756 index c1a7c6cc00e1..63b4fa25b48a 100644
757 --- a/arch/arm/mach-at91/at91rm9200.c
758 +++ b/arch/arm/mach-at91/at91rm9200.c
759 @@ -12,7 +12,6 @@
760  #include <linux/of_platform.h>
762  #include <asm/mach/arch.h>
763 -#include <asm/system_misc.h>
765  #include "generic.h"
766  #include "soc.h"
767 @@ -33,7 +32,6 @@ static void __init at91rm9200_dt_device_init(void)
769         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
771 -       arm_pm_idle = at91rm9200_idle;
772         at91rm9200_pm_init();
775 diff --git a/arch/arm/mach-at91/at91sam9.c b/arch/arm/mach-at91/at91sam9.c
776 index 7eb64f763034..cada2a6412b3 100644
777 --- a/arch/arm/mach-at91/at91sam9.c
778 +++ b/arch/arm/mach-at91/at91sam9.c
779 @@ -62,8 +62,6 @@ static void __init at91sam9_common_init(void)
780                 soc_dev = soc_device_to_device(soc);
782         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
784 -       arm_pm_idle = at91sam9_idle;
787  static void __init at91sam9_dt_device_init(void)
788 diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h
789 index b0fa7dc7286d..28ca57a2060f 100644
790 --- a/arch/arm/mach-at91/generic.h
791 +++ b/arch/arm/mach-at91/generic.h
792 @@ -11,27 +11,18 @@
793  #ifndef _AT91_GENERIC_H
794  #define _AT91_GENERIC_H
796 -#include <linux/of.h>
797 -#include <linux/reboot.h>
799 - /* Map io */
800 -extern void __init at91_map_io(void);
801 -extern void __init at91_alt_map_io(void);
803 -/* idle */
804 -extern void at91rm9200_idle(void);
805 -extern void at91sam9_idle(void);
807  #ifdef CONFIG_PM
808  extern void __init at91rm9200_pm_init(void);
809  extern void __init at91sam9260_pm_init(void);
810  extern void __init at91sam9g45_pm_init(void);
811  extern void __init at91sam9x5_pm_init(void);
812 +extern void __init sama5_pm_init(void);
813  #else
814  static inline void __init at91rm9200_pm_init(void) { }
815  static inline void __init at91sam9260_pm_init(void) { }
816  static inline void __init at91sam9g45_pm_init(void) { }
817  static inline void __init at91sam9x5_pm_init(void) { }
818 +static inline void __init sama5_pm_init(void) { }
819  #endif
821  #endif /* _AT91_GENERIC_H */
822 diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
823 index d687f860a2da..ab53332a9637 100644
824 --- a/arch/arm/mach-at91/pm.c
825 +++ b/arch/arm/mach-at91/pm.c
826 @@ -31,10 +31,13 @@
827  #include <asm/mach/irq.h>
828  #include <asm/fncpy.h>
829  #include <asm/cacheflush.h>
830 +#include <asm/system_misc.h>
832  #include "generic.h"
833  #include "pm.h"
835 +static void __iomem *pmc;
837  /*
838   * FIXME: this is needed to communicate between the pinctrl driver and
839   * the PM implementation in the machine. Possibly part of the PM
840 @@ -87,7 +90,7 @@ static int at91_pm_verify_clocks(void)
841         unsigned long scsr;
842         int i;
844 -       scsr = at91_pmc_read(AT91_PMC_SCSR);
845 +       scsr = readl(pmc + AT91_PMC_SCSR);
847         /* USB must not be using PLLB */
848         if ((scsr & at91_pm_data.uhp_udp_mask) != 0) {
849 @@ -101,8 +104,7 @@ static int at91_pm_verify_clocks(void)
851                 if ((scsr & (AT91_PMC_PCK0 << i)) == 0)
852                         continue;
854 -               css = at91_pmc_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
855 +               css = readl(pmc + AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
856                 if (css != AT91_PMC_CSS_SLOW) {
857                         pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
858                         return 0;
859 @@ -145,8 +147,8 @@ static void at91_pm_suspend(suspend_state_t state)
860         flush_cache_all();
861         outer_disable();
863 -       at91_suspend_sram_fn(at91_pmc_base, at91_ramc_base[0],
864 -                               at91_ramc_base[1], pm_data);
865 +       at91_suspend_sram_fn(pmc, at91_ramc_base[0],
866 +                            at91_ramc_base[1], pm_data);
868         outer_resume();
870 @@ -369,6 +371,21 @@ static __init void at91_dt_ramc(void)
871         at91_pm_set_standby(standby);
874 +void at91rm9200_idle(void)
876 +       /*
877 +        * Disable the processor clock.  The processor will be automatically
878 +        * re-enabled by an interrupt or by a reset.
879 +        */
880 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
883 +void at91sam9_idle(void)
885 +       writel(AT91_PMC_PCK, pmc + AT91_PMC_SCDR);
886 +       cpu_do_idle();
889  static void __init at91_pm_sram_init(void)
891         struct gen_pool *sram_pool;
892 @@ -415,13 +432,36 @@ static void __init at91_pm_sram_init(void)
893                         &at91_pm_suspend_in_sram, at91_pm_suspend_in_sram_sz);
896 -static void __init at91_pm_init(void)
897 +static const struct of_device_id atmel_pmc_ids[] __initconst = {
898 +       { .compatible = "atmel,at91rm9200-pmc"  },
899 +       { .compatible = "atmel,at91sam9260-pmc" },
900 +       { .compatible = "atmel,at91sam9g45-pmc" },
901 +       { .compatible = "atmel,at91sam9n12-pmc" },
902 +       { .compatible = "atmel,at91sam9x5-pmc" },
903 +       { .compatible = "atmel,sama5d3-pmc" },
904 +       { .compatible = "atmel,sama5d2-pmc" },
905 +       { /* sentinel */ },
908 +static void __init at91_pm_init(void (*pm_idle)(void))
910 -       at91_pm_sram_init();
911 +       struct device_node *pmc_np;
913         if (at91_cpuidle_device.dev.platform_data)
914                 platform_device_register(&at91_cpuidle_device);
916 +       pmc_np = of_find_matching_node(NULL, atmel_pmc_ids);
917 +       pmc = of_iomap(pmc_np, 0);
918 +       if (!pmc) {
919 +               pr_err("AT91: PM not supported, PMC not found\n");
920 +               return;
921 +       }
923 +       if (pm_idle)
924 +               arm_pm_idle = pm_idle;
926 +       at91_pm_sram_init();
928         if (at91_suspend_sram_fn)
929                 suspend_set_ops(&at91_pm_ops);
930         else
931 @@ -440,7 +480,7 @@ void __init at91rm9200_pm_init(void)
932         at91_pm_data.uhp_udp_mask = AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP;
933         at91_pm_data.memctrl = AT91_MEMCTRL_MC;
935 -       at91_pm_init();
936 +       at91_pm_init(at91rm9200_idle);
939  void __init at91sam9260_pm_init(void)
940 @@ -448,7 +488,7 @@ void __init at91sam9260_pm_init(void)
941         at91_dt_ramc();
942         at91_pm_data.memctrl = AT91_MEMCTRL_SDRAMC;
943         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
944 -       return at91_pm_init();
945 +       at91_pm_init(at91sam9_idle);
948  void __init at91sam9g45_pm_init(void)
949 @@ -456,7 +496,7 @@ void __init at91sam9g45_pm_init(void)
950         at91_dt_ramc();
951         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP;
952         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
953 -       return at91_pm_init();
954 +       at91_pm_init(at91sam9_idle);
957  void __init at91sam9x5_pm_init(void)
958 @@ -464,5 +504,13 @@ void __init at91sam9x5_pm_init(void)
959         at91_dt_ramc();
960         at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
961         at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
962 -       return at91_pm_init();
963 +       at91_pm_init(at91sam9_idle);
966 +void __init sama5_pm_init(void)
968 +       at91_dt_ramc();
969 +       at91_pm_data.uhp_udp_mask = AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP;
970 +       at91_pm_data.memctrl = AT91_MEMCTRL_DDRSDR;
971 +       at91_pm_init(NULL);
973 diff --git a/arch/arm/mach-at91/sama5.c b/arch/arm/mach-at91/sama5.c
974 index d9cf6799aec0..df8fdf1cf66d 100644
975 --- a/arch/arm/mach-at91/sama5.c
976 +++ b/arch/arm/mach-at91/sama5.c
977 @@ -51,7 +51,7 @@ static void __init sama5_dt_device_init(void)
978                 soc_dev = soc_device_to_device(soc);
980         of_platform_populate(NULL, of_default_bus_match_table, NULL, soc_dev);
981 -       at91sam9x5_pm_init();
982 +       sama5_pm_init();
985  static const char *const sama5_dt_board_compat[] __initconst = {
986 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
987 index 98a2c0cbb833..310dce500d3e 100644
988 --- a/arch/arm/mach-exynos/platsmp.c
989 +++ b/arch/arm/mach-exynos/platsmp.c
990 @@ -230,7 +230,7 @@ static void __iomem *scu_base_addr(void)
991         return (void __iomem *)(S5P_VA_SCU);
994 -static DEFINE_SPINLOCK(boot_lock);
995 +static DEFINE_RAW_SPINLOCK(boot_lock);
997  static void exynos_secondary_init(unsigned int cpu)
999 @@ -243,8 +243,8 @@ static void exynos_secondary_init(unsigned int cpu)
1000         /*
1001          * Synchronise with the boot thread.
1002          */
1003 -       spin_lock(&boot_lock);
1004 -       spin_unlock(&boot_lock);
1005 +       raw_spin_lock(&boot_lock);
1006 +       raw_spin_unlock(&boot_lock);
1009  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
1010 @@ -308,7 +308,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1011          * Set synchronisation state between this boot processor
1012          * and the secondary one
1013          */
1014 -       spin_lock(&boot_lock);
1015 +       raw_spin_lock(&boot_lock);
1017         /*
1018          * The secondary processor is waiting to be released from
1019 @@ -335,7 +335,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1021                 if (timeout == 0) {
1022                         printk(KERN_ERR "cpu1 power enable failed");
1023 -                       spin_unlock(&boot_lock);
1024 +                       raw_spin_unlock(&boot_lock);
1025                         return -ETIMEDOUT;
1026                 }
1027         }
1028 @@ -381,7 +381,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
1029          * calibrations, then wait for it to finish
1030          */
1031  fail:
1032 -       spin_unlock(&boot_lock);
1033 +       raw_spin_unlock(&boot_lock);
1035         return pen_release != -1 ? ret : 0;
1037 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
1038 index b5f8f5ffda79..9753a84df9c4 100644
1039 --- a/arch/arm/mach-hisi/platmcpm.c
1040 +++ b/arch/arm/mach-hisi/platmcpm.c
1041 @@ -61,7 +61,7 @@
1043  static void __iomem *sysctrl, *fabric;
1044  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
1045 -static DEFINE_SPINLOCK(boot_lock);
1046 +static DEFINE_RAW_SPINLOCK(boot_lock);
1047  static u32 fabric_phys_addr;
1048  /*
1049   * [0]: bootwrapper physical address
1050 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1051         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
1052                 return -EINVAL;
1054 -       spin_lock_irq(&boot_lock);
1055 +       raw_spin_lock_irq(&boot_lock);
1057         if (hip04_cpu_table[cluster][cpu])
1058                 goto out;
1059 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
1061  out:
1062         hip04_cpu_table[cluster][cpu]++;
1063 -       spin_unlock_irq(&boot_lock);
1064 +       raw_spin_unlock_irq(&boot_lock);
1066         return 0;
1068 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
1069         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
1070         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
1072 -       spin_lock(&boot_lock);
1073 +       raw_spin_lock(&boot_lock);
1074         hip04_cpu_table[cluster][cpu]--;
1075         if (hip04_cpu_table[cluster][cpu] == 1) {
1076                 /* A power_up request went ahead of us. */
1077 -               spin_unlock(&boot_lock);
1078 +               raw_spin_unlock(&boot_lock);
1079                 return;
1080         } else if (hip04_cpu_table[cluster][cpu] > 1) {
1081                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
1082 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
1083         }
1085         last_man = hip04_cluster_is_down(cluster);
1086 -       spin_unlock(&boot_lock);
1087 +       raw_spin_unlock(&boot_lock);
1088         if (last_man) {
1089                 /* Since it's Cortex A15, disable L2 prefetching. */
1090                 asm volatile(
1091 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1092                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
1094         count = TIMEOUT_MSEC / POLL_MSEC;
1095 -       spin_lock_irq(&boot_lock);
1096 +       raw_spin_lock_irq(&boot_lock);
1097         for (tries = 0; tries < count; tries++) {
1098                 if (hip04_cpu_table[cluster][cpu])
1099                         goto err;
1100 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1101                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
1102                 if (data & CORE_WFI_STATUS(cpu))
1103                         break;
1104 -               spin_unlock_irq(&boot_lock);
1105 +               raw_spin_unlock_irq(&boot_lock);
1106                 /* Wait for clean L2 when the whole cluster is down. */
1107                 msleep(POLL_MSEC);
1108 -               spin_lock_irq(&boot_lock);
1109 +               raw_spin_lock_irq(&boot_lock);
1110         }
1111         if (tries >= count)
1112                 goto err;
1113 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
1114                 goto err;
1115         if (hip04_cluster_is_down(cluster))
1116                 hip04_set_snoop_filter(cluster, 0);
1117 -       spin_unlock_irq(&boot_lock);
1118 +       raw_spin_unlock_irq(&boot_lock);
1119         return 1;
1120  err:
1121 -       spin_unlock_irq(&boot_lock);
1122 +       raw_spin_unlock_irq(&boot_lock);
1123         return 0;
1125  #endif
1126 diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig
1127 index 8ceda2844c4f..08bcf8fb76f2 100644
1128 --- a/arch/arm/mach-imx/Kconfig
1129 +++ b/arch/arm/mach-imx/Kconfig
1130 @@ -524,7 +524,7 @@ config SOC_IMX6Q
1131         bool "i.MX6 Quad/DualLite support"
1132         select ARM_ERRATA_764369 if SMP
1133         select HAVE_ARM_SCU if SMP
1134 -       select HAVE_ARM_TWD if SMP
1135 +       select HAVE_ARM_TWD
1136         select PCI_DOMAINS if PCI
1137         select PINCTRL_IMX6Q
1138         select SOC_IMX6
1139 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
1140 index 79e1f876d1c9..7e625c17f78e 100644
1141 --- a/arch/arm/mach-omap2/omap-smp.c
1142 +++ b/arch/arm/mach-omap2/omap-smp.c
1143 @@ -43,7 +43,7 @@
1144  /* SCU base address */
1145  static void __iomem *scu_base;
1147 -static DEFINE_SPINLOCK(boot_lock);
1148 +static DEFINE_RAW_SPINLOCK(boot_lock);
1150  void __iomem *omap4_get_scu_base(void)
1152 @@ -74,8 +74,8 @@ static void omap4_secondary_init(unsigned int cpu)
1153         /*
1154          * Synchronise with the boot thread.
1155          */
1156 -       spin_lock(&boot_lock);
1157 -       spin_unlock(&boot_lock);
1158 +       raw_spin_lock(&boot_lock);
1159 +       raw_spin_unlock(&boot_lock);
1162  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1163 @@ -89,7 +89,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1164          * Set synchronisation state between this boot processor
1165          * and the secondary one
1166          */
1167 -       spin_lock(&boot_lock);
1168 +       raw_spin_lock(&boot_lock);
1170         /*
1171          * Update the AuxCoreBoot0 with boot state for secondary core.
1172 @@ -166,7 +166,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
1173          * Now the secondary core is starting up let it run its
1174          * calibrations, then wait for it to finish
1175          */
1176 -       spin_unlock(&boot_lock);
1177 +       raw_spin_unlock(&boot_lock);
1179         return 0;
1181 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
1182 index e46c91094dde..dcb3ed0c26da 100644
1183 --- a/arch/arm/mach-prima2/platsmp.c
1184 +++ b/arch/arm/mach-prima2/platsmp.c
1185 @@ -22,7 +22,7 @@
1187  static void __iomem *clk_base;
1189 -static DEFINE_SPINLOCK(boot_lock);
1190 +static DEFINE_RAW_SPINLOCK(boot_lock);
1192  static void sirfsoc_secondary_init(unsigned int cpu)
1194 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
1195         /*
1196          * Synchronise with the boot thread.
1197          */
1198 -       spin_lock(&boot_lock);
1199 -       spin_unlock(&boot_lock);
1200 +       raw_spin_lock(&boot_lock);
1201 +       raw_spin_unlock(&boot_lock);
1204  static const struct of_device_id clk_ids[]  = {
1205 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1206         /* make sure write buffer is drained */
1207         mb();
1209 -       spin_lock(&boot_lock);
1210 +       raw_spin_lock(&boot_lock);
1212         /*
1213          * The secondary processor is waiting to be released from
1214 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
1215          * now the secondary core is starting up let it run its
1216          * calibrations, then wait for it to finish
1217          */
1218 -       spin_unlock(&boot_lock);
1219 +       raw_spin_unlock(&boot_lock);
1221         return pen_release != -1 ? -ENOSYS : 0;
1223 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
1224 index 9b00123a315d..0a49fe1bc8cf 100644
1225 --- a/arch/arm/mach-qcom/platsmp.c
1226 +++ b/arch/arm/mach-qcom/platsmp.c
1227 @@ -46,7 +46,7 @@
1229  extern void secondary_startup_arm(void);
1231 -static DEFINE_SPINLOCK(boot_lock);
1232 +static DEFINE_RAW_SPINLOCK(boot_lock);
1234  #ifdef CONFIG_HOTPLUG_CPU
1235  static void qcom_cpu_die(unsigned int cpu)
1236 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
1237         /*
1238          * Synchronise with the boot thread.
1239          */
1240 -       spin_lock(&boot_lock);
1241 -       spin_unlock(&boot_lock);
1242 +       raw_spin_lock(&boot_lock);
1243 +       raw_spin_unlock(&boot_lock);
1246  static int scss_release_secondary(unsigned int cpu)
1247 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1248          * set synchronisation state between this boot processor
1249          * and the secondary one
1250          */
1251 -       spin_lock(&boot_lock);
1252 +       raw_spin_lock(&boot_lock);
1254         /*
1255          * Send the secondary CPU a soft interrupt, thereby causing
1256 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
1257          * now the secondary core is starting up let it run its
1258          * calibrations, then wait for it to finish
1259          */
1260 -       spin_unlock(&boot_lock);
1261 +       raw_spin_unlock(&boot_lock);
1263         return ret;
1265 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
1266 index fd4297713d67..b0553b2c2d53 100644
1267 --- a/arch/arm/mach-spear/platsmp.c
1268 +++ b/arch/arm/mach-spear/platsmp.c
1269 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1270         sync_cache_w(&pen_release);
1273 -static DEFINE_SPINLOCK(boot_lock);
1274 +static DEFINE_RAW_SPINLOCK(boot_lock);
1276  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
1278 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
1279         /*
1280          * Synchronise with the boot thread.
1281          */
1282 -       spin_lock(&boot_lock);
1283 -       spin_unlock(&boot_lock);
1284 +       raw_spin_lock(&boot_lock);
1285 +       raw_spin_unlock(&boot_lock);
1288  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1289 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1290          * set synchronisation state between this boot processor
1291          * and the secondary one
1292          */
1293 -       spin_lock(&boot_lock);
1294 +       raw_spin_lock(&boot_lock);
1296         /*
1297          * The secondary processor is waiting to be released from
1298 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
1299          * now the secondary core is starting up let it run its
1300          * calibrations, then wait for it to finish
1301          */
1302 -       spin_unlock(&boot_lock);
1303 +       raw_spin_unlock(&boot_lock);
1305         return pen_release != -1 ? -ENOSYS : 0;
1307 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
1308 index c4ad6eae67fa..e830b20b212f 100644
1309 --- a/arch/arm/mach-sti/platsmp.c
1310 +++ b/arch/arm/mach-sti/platsmp.c
1311 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
1312         sync_cache_w(&pen_release);
1315 -static DEFINE_SPINLOCK(boot_lock);
1316 +static DEFINE_RAW_SPINLOCK(boot_lock);
1318  static void sti_secondary_init(unsigned int cpu)
1320 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
1321         /*
1322          * Synchronise with the boot thread.
1323          */
1324 -       spin_lock(&boot_lock);
1325 -       spin_unlock(&boot_lock);
1326 +       raw_spin_lock(&boot_lock);
1327 +       raw_spin_unlock(&boot_lock);
1330  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1331 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1332          * set synchronisation state between this boot processor
1333          * and the secondary one
1334          */
1335 -       spin_lock(&boot_lock);
1336 +       raw_spin_lock(&boot_lock);
1338         /*
1339          * The secondary processor is waiting to be released from
1340 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
1341          * now the secondary core is starting up let it run its
1342          * calibrations, then wait for it to finish
1343          */
1344 -       spin_unlock(&boot_lock);
1345 +       raw_spin_unlock(&boot_lock);
1347         return pen_release != -1 ? -ENOSYS : 0;
1349 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1350 index c095455d496e..276ac73f8650 100644
1351 --- a/arch/arm/mm/fault.c
1352 +++ b/arch/arm/mm/fault.c
1353 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1354         if (addr < TASK_SIZE)
1355                 return do_page_fault(addr, fsr, regs);
1357 +       if (interrupts_enabled(regs))
1358 +               local_irq_enable();
1360         if (user_mode(regs))
1361                 goto bad_area;
1363 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1364  static int
1365  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1367 +       if (interrupts_enabled(regs))
1368 +               local_irq_enable();
1370         do_bad_area(addr, fsr, regs);
1371         return 0;
1373 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1374 index d02f8187b1cc..542692dbd40a 100644
1375 --- a/arch/arm/mm/highmem.c
1376 +++ b/arch/arm/mm/highmem.c
1377 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1378         return *ptep;
1381 +static unsigned int fixmap_idx(int type)
1383 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1386  void *kmap(struct page *page)
1388         might_sleep();
1389 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1391  void *kmap_atomic(struct page *page)
1393 +       pte_t pte = mk_pte(page, kmap_prot);
1394         unsigned int idx;
1395         unsigned long vaddr;
1396         void *kmap;
1397         int type;
1399 -       preempt_disable();
1400 +       preempt_disable_nort();
1401         pagefault_disable();
1402         if (!PageHighMem(page))
1403                 return page_address(page);
1404 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1406         type = kmap_atomic_idx_push();
1408 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1409 +       idx = fixmap_idx(type);
1410         vaddr = __fix_to_virt(idx);
1411  #ifdef CONFIG_DEBUG_HIGHMEM
1412         /*
1413 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1414          * in place, so the contained TLB flush ensures the TLB is updated
1415          * with the new mapping.
1416          */
1417 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1418 +#ifdef CONFIG_PREEMPT_RT_FULL
1419 +       current->kmap_pte[type] = pte;
1420 +#endif
1421 +       set_fixmap_pte(idx, pte);
1423         return (void *)vaddr;
1425 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1427         if (kvaddr >= (void *)FIXADDR_START) {
1428                 type = kmap_atomic_idx();
1429 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1430 +               idx = fixmap_idx(type);
1432                 if (cache_is_vivt())
1433                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1434 +#ifdef CONFIG_PREEMPT_RT_FULL
1435 +               current->kmap_pte[type] = __pte(0);
1436 +#endif
1437  #ifdef CONFIG_DEBUG_HIGHMEM
1438                 BUG_ON(vaddr != __fix_to_virt(idx));
1439 -               set_fixmap_pte(idx, __pte(0));
1440  #else
1441                 (void) idx;  /* to kill a warning */
1442  #endif
1443 +               set_fixmap_pte(idx, __pte(0));
1444                 kmap_atomic_idx_pop();
1445         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1446                 /* this address was obtained through kmap_high_get() */
1447                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1448         }
1449         pagefault_enable();
1450 -       preempt_enable();
1451 +       preempt_enable_nort();
1453  EXPORT_SYMBOL(__kunmap_atomic);
1455  void *kmap_atomic_pfn(unsigned long pfn)
1457 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1458         unsigned long vaddr;
1459         int idx, type;
1460         struct page *page = pfn_to_page(pfn);
1462 -       preempt_disable();
1463 +       preempt_disable_nort();
1464         pagefault_disable();
1465         if (!PageHighMem(page))
1466                 return page_address(page);
1468         type = kmap_atomic_idx_push();
1469 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1470 +       idx = fixmap_idx(type);
1471         vaddr = __fix_to_virt(idx);
1472  #ifdef CONFIG_DEBUG_HIGHMEM
1473         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1474  #endif
1475 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1476 +#ifdef CONFIG_PREEMPT_RT_FULL
1477 +       current->kmap_pte[type] = pte;
1478 +#endif
1479 +       set_fixmap_pte(idx, pte);
1481         return (void *)vaddr;
1483 +#if defined CONFIG_PREEMPT_RT_FULL
1484 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1486 +       int i;
1488 +       /*
1489 +        * Clear @prev's kmap_atomic mappings
1490 +        */
1491 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1492 +               int idx = fixmap_idx(i);
1494 +               set_fixmap_pte(idx, __pte(0));
1495 +       }
1496 +       /*
1497 +        * Restore @next_p's kmap_atomic mappings
1498 +        */
1499 +       for (i = 0; i < next_p->kmap_idx; i++) {
1500 +               int idx = fixmap_idx(i);
1502 +               if (!pte_none(next_p->kmap_pte[i]))
1503 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1504 +       }
1506 +#endif
1507 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1508 index 53feb90c840c..b4a8d54fc3f3 100644
1509 --- a/arch/arm/plat-versatile/platsmp.c
1510 +++ b/arch/arm/plat-versatile/platsmp.c
1511 @@ -30,7 +30,7 @@ static void write_pen_release(int val)
1512         sync_cache_w(&pen_release);
1515 -static DEFINE_SPINLOCK(boot_lock);
1516 +static DEFINE_RAW_SPINLOCK(boot_lock);
1518  void versatile_secondary_init(unsigned int cpu)
1520 @@ -43,8 +43,8 @@ void versatile_secondary_init(unsigned int cpu)
1521         /*
1522          * Synchronise with the boot thread.
1523          */
1524 -       spin_lock(&boot_lock);
1525 -       spin_unlock(&boot_lock);
1526 +       raw_spin_lock(&boot_lock);
1527 +       raw_spin_unlock(&boot_lock);
1530  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1531 @@ -55,7 +55,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1532          * Set synchronisation state between this boot processor
1533          * and the secondary one
1534          */
1535 -       spin_lock(&boot_lock);
1536 +       raw_spin_lock(&boot_lock);
1538         /*
1539          * This is really belt and braces; we hold unintended secondary
1540 @@ -85,7 +85,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1541          * now the secondary core is starting up let it run its
1542          * calibrations, then wait for it to finish
1543          */
1544 -       spin_unlock(&boot_lock);
1545 +       raw_spin_unlock(&boot_lock);
1547         return pen_release != -1 ? -ENOSYS : 0;
1549 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1550 index 14cdc6dea493..9196cf82f7be 100644
1551 --- a/arch/arm64/Kconfig
1552 +++ b/arch/arm64/Kconfig
1553 @@ -76,6 +76,7 @@ config ARM64
1554         select HAVE_PERF_REGS
1555         select HAVE_PERF_USER_STACK_DUMP
1556         select HAVE_RCU_TABLE_FREE
1557 +       select HAVE_PREEMPT_LAZY
1558         select HAVE_SYSCALL_TRACEPOINTS
1559         select IOMMU_DMA if IOMMU_SUPPORT
1560         select IRQ_DOMAIN
1561 @@ -582,7 +583,7 @@ config XEN_DOM0
1563  config XEN
1564         bool "Xen guest support on ARM64"
1565 -       depends on ARM64 && OF
1566 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1567         select SWIOTLB_XEN
1568         help
1569           Say Y if you want to run Linux in a Virtual Machine on Xen on ARM64.
1570 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1571 index 90c7ff233735..5f4e89fbc290 100644
1572 --- a/arch/arm64/include/asm/thread_info.h
1573 +++ b/arch/arm64/include/asm/thread_info.h
1574 @@ -49,6 +49,7 @@ struct thread_info {
1575         mm_segment_t            addr_limit;     /* address limit */
1576         struct task_struct      *task;          /* main task structure */
1577         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1578 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1579         int                     cpu;            /* cpu */
1580  };
1582 @@ -103,6 +104,7 @@ static inline struct thread_info *current_thread_info(void)
1583  #define TIF_NEED_RESCHED       1
1584  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1585  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1586 +#define TIF_NEED_RESCHED_LAZY  4
1587  #define TIF_NOHZ               7
1588  #define TIF_SYSCALL_TRACE      8
1589  #define TIF_SYSCALL_AUDIT      9
1590 @@ -118,6 +120,7 @@ static inline struct thread_info *current_thread_info(void)
1591  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1592  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1593  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1594 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1595  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1596  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1597  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1598 @@ -126,7 +129,8 @@ static inline struct thread_info *current_thread_info(void)
1599  #define _TIF_32BIT             (1 << TIF_32BIT)
1601  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1602 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1603 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1604 +                                _TIF_NEED_RESCHED_LAZY)
1606  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1607                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1608 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1609 index 087cf9a65359..d74475928399 100644
1610 --- a/arch/arm64/kernel/asm-offsets.c
1611 +++ b/arch/arm64/kernel/asm-offsets.c
1612 @@ -35,6 +35,7 @@ int main(void)
1613    BLANK();
1614    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1615    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1616 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1617    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1618    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1619    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1620 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1621 index bd14849beb73..cf92d4ee51db 100644
1622 --- a/arch/arm64/kernel/entry.S
1623 +++ b/arch/arm64/kernel/entry.S
1624 @@ -376,11 +376,16 @@ el1_irq:
1625  #ifdef CONFIG_PREEMPT
1626         get_thread_info tsk
1627         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1628 -       cbnz    w24, 1f                         // preempt count != 0
1629 +       cbnz    w24, 2f                         // preempt count != 0
1630         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1631 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1632 -       bl      el1_preempt
1633 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1635 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1636 +       cbnz    w24, 2f                         // preempt lazy count != 0
1637 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1638  1:
1639 +       bl      el1_preempt
1641  #endif
1642  #ifdef CONFIG_TRACE_IRQFLAGS
1643         bl      trace_hardirqs_on
1644 @@ -394,6 +399,7 @@ el1_preempt:
1645  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1646         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1647         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1648 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1649         ret     x24
1650  #endif
1652 @@ -638,6 +644,7 @@ ret_fast_syscall_trace:
1653   */
1654  work_pending:
1655         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1656 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1657         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1658         ldr     x2, [sp, #S_PSTATE]
1659         mov     x0, sp                          // 'regs'
1660 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1661 index 8b0424abc84c..5422d4c0bbdf 100644
1662 --- a/arch/mips/Kconfig
1663 +++ b/arch/mips/Kconfig
1664 @@ -2411,7 +2411,7 @@ config CPU_R4400_WORKAROUNDS
1666  config HIGHMEM
1667         bool "High Memory Support"
1668 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1669 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1671  config CPU_SUPPORTS_HIGHMEM
1672         bool
1673 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
1674 index a017b23ee4aa..8d4d9270140f 100644
1675 --- a/arch/mips/kvm/mips.c
1676 +++ b/arch/mips/kvm/mips.c
1677 @@ -454,8 +454,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1679         dvcpu->arch.wait = 0;
1681 -       if (waitqueue_active(&dvcpu->wq))
1682 -               wake_up_interruptible(&dvcpu->wq);
1683 +       if (swait_active(&dvcpu->wq))
1684 +               swake_up(&dvcpu->wq);
1686         return 0;
1688 @@ -1183,8 +1183,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1689         kvm_mips_callbacks->queue_timer_int(vcpu);
1691         vcpu->arch.wait = 0;
1692 -       if (waitqueue_active(&vcpu->wq))
1693 -               wake_up_interruptible(&vcpu->wq);
1694 +       if (swait_active(&vcpu->wq))
1695 +               swake_up(&vcpu->wq);
1698  /* low level hrtimer wake routine */
1699 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1700 index db49e0d796b1..1d2be228661c 100644
1701 --- a/arch/powerpc/Kconfig
1702 +++ b/arch/powerpc/Kconfig
1703 @@ -60,10 +60,11 @@ config LOCKDEP_SUPPORT
1705  config RWSEM_GENERIC_SPINLOCK
1706         bool
1707 +       default y if PREEMPT_RT_FULL
1709  config RWSEM_XCHGADD_ALGORITHM
1710         bool
1711 -       default y
1712 +       default y if !PREEMPT_RT_FULL
1714  config GENERIC_LOCKBREAK
1715         bool
1716 @@ -141,6 +142,7 @@ config PPC
1717         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1718         select GENERIC_STRNCPY_FROM_USER
1719         select GENERIC_STRNLEN_USER
1720 +       select HAVE_PREEMPT_LAZY
1721         select HAVE_MOD_ARCH_SPECIFIC
1722         select MODULES_USE_ELF_RELA
1723         select CLONE_BACKWARDS
1724 @@ -319,7 +321,7 @@ menu "Kernel options"
1726  config HIGHMEM
1727         bool "High memory support"
1728 -       depends on PPC32
1729 +       depends on PPC32 && !PREEMPT_RT_FULL
1731  source kernel/Kconfig.hz
1732  source kernel/Kconfig.preempt
1733 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
1734 index a92d95aee42d..20376580583f 100644
1735 --- a/arch/powerpc/include/asm/kvm_host.h
1736 +++ b/arch/powerpc/include/asm/kvm_host.h
1737 @@ -286,7 +286,7 @@ struct kvmppc_vcore {
1738         struct list_head runnable_threads;
1739         struct list_head preempt_list;
1740         spinlock_t lock;
1741 -       wait_queue_head_t wq;
1742 +       struct swait_queue_head wq;
1743         spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
1744         u64 stolen_tb;
1745         u64 preempt_tb;
1746 @@ -627,7 +627,7 @@ struct kvm_vcpu_arch {
1747         u8 prodded;
1748         u32 last_inst;
1750 -       wait_queue_head_t *wqp;
1751 +       struct swait_queue_head *wqp;
1752         struct kvmppc_vcore *vcore;
1753         int ret;
1754         int trap;
1755 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1756 index 7efee4a3240b..40e6fa1b85b2 100644
1757 --- a/arch/powerpc/include/asm/thread_info.h
1758 +++ b/arch/powerpc/include/asm/thread_info.h
1759 @@ -42,6 +42,8 @@ struct thread_info {
1760         int             cpu;                    /* cpu we're on */
1761         int             preempt_count;          /* 0 => preemptable,
1762                                                    <0 => BUG */
1763 +       int             preempt_lazy_count;      /* 0 => preemptable,
1764 +                                                  <0 => BUG */
1765         unsigned long   local_flags;            /* private flags for thread */
1767         /* low level flags - has atomic operations done on it */
1768 @@ -82,8 +84,7 @@ static inline struct thread_info *current_thread_info(void)
1769  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1770  #define TIF_SIGPENDING         1       /* signal pending */
1771  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1772 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1773 -                                          TIF_NEED_RESCHED */
1774 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1775  #define TIF_32BIT              4       /* 32 bit binary */
1776  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1777  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1778 @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void)
1779  #if defined(CONFIG_PPC64)
1780  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1781  #endif
1782 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1783 +                                          TIF_NEED_RESCHED */
1785  /* as above, but as bit values */
1786  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1787 @@ -119,14 +122,16 @@ static inline struct thread_info *current_thread_info(void)
1788  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1789  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1790  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1791 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1792  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1793                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1794                                  _TIF_NOHZ)
1796  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1797                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1798 -                                _TIF_RESTORE_TM)
1799 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1800  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1801 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1803  /* Bits in local_flags */
1804  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1805 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1806 index 40da69163d51..bd040815334b 100644
1807 --- a/arch/powerpc/kernel/asm-offsets.c
1808 +++ b/arch/powerpc/kernel/asm-offsets.c
1809 @@ -160,6 +160,7 @@ int main(void)
1810         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1811         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1812         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1813 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1814         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1815         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1817 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1818 index 2405631e91a2..c21b4b42eaa0 100644
1819 --- a/arch/powerpc/kernel/entry_32.S
1820 +++ b/arch/powerpc/kernel/entry_32.S
1821 @@ -818,7 +818,14 @@ resume_kernel:
1822         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1823         bne     restore
1824         andi.   r8,r8,_TIF_NEED_RESCHED
1825 +       bne+    1f
1826 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1827 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1828 +       bne     restore
1829 +       lwz     r0,TI_FLAGS(r9)
1830 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1831         beq+    restore
1833         lwz     r3,_MSR(r1)
1834         andi.   r0,r3,MSR_EE    /* interrupts off? */
1835         beq     restore         /* don't schedule if so */
1836 @@ -829,11 +836,11 @@ resume_kernel:
1837          */
1838         bl      trace_hardirqs_off
1839  #endif
1840 -1:     bl      preempt_schedule_irq
1841 +2:     bl      preempt_schedule_irq
1842         CURRENT_THREAD_INFO(r9, r1)
1843         lwz     r3,TI_FLAGS(r9)
1844 -       andi.   r0,r3,_TIF_NEED_RESCHED
1845 -       bne-    1b
1846 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1847 +       bne-    2b
1848  #ifdef CONFIG_TRACE_IRQFLAGS
1849         /* And now, to properly rebalance the above, we tell lockdep they
1850          * are being turned back on, which will happen when we return
1851 @@ -1154,7 +1161,7 @@ global_dbcr0:
1852  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1854  do_work:                       /* r10 contains MSR_KERNEL here */
1855 -       andi.   r0,r9,_TIF_NEED_RESCHED
1856 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1857         beq     do_user_signal
1859  do_resched:                    /* r10 contains MSR_KERNEL here */
1860 @@ -1175,7 +1182,7 @@ recheck:
1861         MTMSRD(r10)             /* disable interrupts */
1862         CURRENT_THREAD_INFO(r9, r1)
1863         lwz     r9,TI_FLAGS(r9)
1864 -       andi.   r0,r9,_TIF_NEED_RESCHED
1865 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1866         bne-    do_resched
1867         andi.   r0,r9,_TIF_USER_WORK_MASK
1868         beq     restore_user
1869 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1870 index f6fd0332c3a2..96235fe0a581 100644
1871 --- a/arch/powerpc/kernel/entry_64.S
1872 +++ b/arch/powerpc/kernel/entry_64.S
1873 @@ -683,7 +683,7 @@ _GLOBAL(ret_from_except_lite)
1874  #else
1875         beq     restore
1876  #endif
1877 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1878 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1879         beq     2f
1880         bl      restore_interrupts
1881         SCHEDULE_USER
1882 @@ -745,10 +745,18 @@ resume_kernel:
1884  #ifdef CONFIG_PREEMPT
1885         /* Check if we need to preempt */
1886 +       lwz     r8,TI_PREEMPT(r9)
1887 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1888 +       bne     restore
1889         andi.   r0,r4,_TIF_NEED_RESCHED
1890 +       bne+    check_count
1892 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1893         beq+    restore
1894 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1896         /* Check that preempt_count() == 0 and interrupts are enabled */
1897 -       lwz     r8,TI_PREEMPT(r9)
1898 +check_count:
1899         cmpwi   cr1,r8,0
1900         ld      r0,SOFTE(r1)
1901         cmpdi   r0,0
1902 @@ -765,7 +773,7 @@ resume_kernel:
1903         /* Re-test flags and eventually loop */
1904         CURRENT_THREAD_INFO(r9, r1)
1905         ld      r4,TI_FLAGS(r9)
1906 -       andi.   r0,r4,_TIF_NEED_RESCHED
1907 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1908         bne     1b
1910         /*
1911 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1912 index 290559df1e8b..070afa6da35d 100644
1913 --- a/arch/powerpc/kernel/irq.c
1914 +++ b/arch/powerpc/kernel/irq.c
1915 @@ -614,6 +614,7 @@ void irq_ctx_init(void)
1916         }
1919 +#ifndef CONFIG_PREEMPT_RT_FULL
1920  void do_softirq_own_stack(void)
1922         struct thread_info *curtp, *irqtp;
1923 @@ -631,6 +632,7 @@ void do_softirq_own_stack(void)
1924         if (irqtp->flags)
1925                 set_bits(irqtp->flags, &curtp->flags);
1927 +#endif
1929  irq_hw_number_t virq_to_hw(unsigned int virq)
1931 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1932 index df4efa304b2c..9cb0c2f6e7ac 100644
1933 --- a/arch/powerpc/kernel/misc_32.S
1934 +++ b/arch/powerpc/kernel/misc_32.S
1935 @@ -40,6 +40,7 @@
1936   * We store the saved ksp_limit in the unused part
1937   * of the STACK_FRAME_OVERHEAD
1938   */
1939 +#ifndef CONFIG_PREEMPT_RT_FULL
1940  _GLOBAL(call_do_softirq)
1941         mflr    r0
1942         stw     r0,4(r1)
1943 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1944         stw     r10,THREAD+KSP_LIMIT(r2)
1945         mtlr    r0
1946         blr
1947 +#endif
1949  /*
1950   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1951 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1952 index db475d41b57a..96b7ef80e05d 100644
1953 --- a/arch/powerpc/kernel/misc_64.S
1954 +++ b/arch/powerpc/kernel/misc_64.S
1955 @@ -30,6 +30,7 @@
1957         .text
1959 +#ifndef CONFIG_PREEMPT_RT_FULL
1960  _GLOBAL(call_do_softirq)
1961         mflr    r0
1962         std     r0,16(r1)
1963 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1964         ld      r0,16(r1)
1965         mtlr    r0
1966         blr
1967 +#endif
1969  _GLOBAL(call_do_irq)
1970         mflr    r0
1971 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1972 index c2024ac9d4e8..2303788da7e1 100644
1973 --- a/arch/powerpc/kvm/Kconfig
1974 +++ b/arch/powerpc/kvm/Kconfig
1975 @@ -172,6 +172,7 @@ config KVM_E500MC
1976  config KVM_MPIC
1977         bool "KVM in-kernel MPIC emulation"
1978         depends on KVM && E500
1979 +       depends on !PREEMPT_RT_FULL
1980         select HAVE_KVM_IRQCHIP
1981         select HAVE_KVM_IRQFD
1982         select HAVE_KVM_IRQ_ROUTING
1983 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
1984 index 3c3a367b6e59..2ac7a8b108ac 100644
1985 --- a/arch/powerpc/kvm/book3s_hv.c
1986 +++ b/arch/powerpc/kvm/book3s_hv.c
1987 @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
1988  static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
1990         int cpu;
1991 -       wait_queue_head_t *wqp;
1992 +       struct swait_queue_head *wqp;
1994         wqp = kvm_arch_vcpu_wq(vcpu);
1995 -       if (waitqueue_active(wqp)) {
1996 -               wake_up_interruptible(wqp);
1997 +       if (swait_active(wqp)) {
1998 +               swake_up(wqp);
1999                 ++vcpu->stat.halt_wakeup;
2000         }
2002 @@ -707,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
2003                 tvcpu->arch.prodded = 1;
2004                 smp_mb();
2005                 if (vcpu->arch.ceded) {
2006 -                       if (waitqueue_active(&vcpu->wq)) {
2007 -                               wake_up_interruptible(&vcpu->wq);
2008 +                       if (swait_active(&vcpu->wq)) {
2009 +                               swake_up(&vcpu->wq);
2010                                 vcpu->stat.halt_wakeup++;
2011                         }
2012                 }
2013 @@ -1453,7 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
2014         INIT_LIST_HEAD(&vcore->runnable_threads);
2015         spin_lock_init(&vcore->lock);
2016         spin_lock_init(&vcore->stoltb_lock);
2017 -       init_waitqueue_head(&vcore->wq);
2018 +       init_swait_queue_head(&vcore->wq);
2019         vcore->preempt_tb = TB_NIL;
2020         vcore->lpcr = kvm->arch.lpcr;
2021         vcore->first_vcpuid = core * threads_per_subcore;
2022 @@ -2525,10 +2525,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2024         struct kvm_vcpu *vcpu;
2025         int do_sleep = 1;
2026 +       DECLARE_SWAITQUEUE(wait);
2028 -       DEFINE_WAIT(wait);
2030 -       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2031 +       prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2033         /*
2034          * Check one last time for pending exceptions and ceded state after
2035 @@ -2542,7 +2541,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2036         }
2038         if (!do_sleep) {
2039 -               finish_wait(&vc->wq, &wait);
2040 +               finish_swait(&vc->wq, &wait);
2041                 return;
2042         }
2044 @@ -2550,7 +2549,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2045         trace_kvmppc_vcore_blocked(vc, 0);
2046         spin_unlock(&vc->lock);
2047         schedule();
2048 -       finish_wait(&vc->wq, &wait);
2049 +       finish_swait(&vc->wq, &wait);
2050         spin_lock(&vc->lock);
2051         vc->vcore_state = VCORE_INACTIVE;
2052         trace_kvmppc_vcore_blocked(vc, 1);
2053 @@ -2606,7 +2605,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2054                         kvmppc_start_thread(vcpu, vc);
2055                         trace_kvm_guest_enter(vcpu);
2056                 } else if (vc->vcore_state == VCORE_SLEEPING) {
2057 -                       wake_up(&vc->wq);
2058 +                       swake_up(&vc->wq);
2059                 }
2061         }
2062 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
2063 index 3f175e8aedb4..c4c02f91904c 100644
2064 --- a/arch/powerpc/platforms/ps3/device-init.c
2065 +++ b/arch/powerpc/platforms/ps3/device-init.c
2066 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
2067         }
2068         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
2070 -       res = wait_event_interruptible(dev->done.wait,
2071 +       res = swait_event_interruptible(dev->done.wait,
2072                                        dev->done.done || kthread_should_stop());
2073         if (kthread_should_stop())
2074                 res = -EINTR;
2075 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
2076 index e9a983f40a24..bbdc539fb3c6 100644
2077 --- a/arch/s390/include/asm/kvm_host.h
2078 +++ b/arch/s390/include/asm/kvm_host.h
2079 @@ -427,7 +427,7 @@ struct kvm_s390_irq_payload {
2080  struct kvm_s390_local_interrupt {
2081         spinlock_t lock;
2082         struct kvm_s390_float_interrupt *float_int;
2083 -       wait_queue_head_t *wq;
2084 +       struct swait_queue_head *wq;
2085         atomic_t *cpuflags;
2086         DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2087         struct kvm_s390_irq_payload irq;
2088 diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
2089 index 6a75352f453c..cc862c486002 100644
2090 --- a/arch/s390/kvm/interrupt.c
2091 +++ b/arch/s390/kvm/interrupt.c
2092 @@ -868,13 +868,13 @@ no_timer:
2094  void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
2096 -       if (waitqueue_active(&vcpu->wq)) {
2097 +       if (swait_active(&vcpu->wq)) {
2098                 /*
2099                  * The vcpu gave up the cpu voluntarily, mark it as a good
2100                  * yield-candidate.
2101                  */
2102                 vcpu->preempted = true;
2103 -               wake_up_interruptible(&vcpu->wq);
2104 +               swake_up(&vcpu->wq);
2105                 vcpu->stat.halt_wakeup++;
2106         }
2108 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
2109 index 6c0378c0b8b5..abd58b4dff97 100644
2110 --- a/arch/sh/kernel/irq.c
2111 +++ b/arch/sh/kernel/irq.c
2112 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
2113         hardirq_ctx[cpu] = NULL;
2116 +#ifndef CONFIG_PREEMPT_RT_FULL
2117  void do_softirq_own_stack(void)
2119         struct thread_info *curctx;
2120 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
2121                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
2122         );
2124 +#endif
2125  #else
2126  static inline void handle_one_irq(unsigned int irq)
2128 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
2129 index 56442d2d7bbc..8c9598f534c9 100644
2130 --- a/arch/sparc/Kconfig
2131 +++ b/arch/sparc/Kconfig
2132 @@ -189,12 +189,10 @@ config NR_CPUS
2133  source kernel/Kconfig.hz
2135  config RWSEM_GENERIC_SPINLOCK
2136 -       bool
2137 -       default y if SPARC32
2138 +       def_bool PREEMPT_RT_FULL
2140  config RWSEM_XCHGADD_ALGORITHM
2141 -       bool
2142 -       default y if SPARC64
2143 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2145  config GENERIC_HWEIGHT
2146         bool
2147 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
2148 index e22416ce56ea..d359de71153a 100644
2149 --- a/arch/sparc/kernel/irq_64.c
2150 +++ b/arch/sparc/kernel/irq_64.c
2151 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
2152         set_irq_regs(old_regs);
2155 +#ifndef CONFIG_PREEMPT_RT_FULL
2156  void do_softirq_own_stack(void)
2158         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
2159 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
2160         __asm__ __volatile__("mov %0, %%sp"
2161                              : : "r" (orig_sp));
2163 +#endif
2165  #ifdef CONFIG_HOTPLUG_CPU
2166  void fixup_irqs(void)
2167 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2168 index 436639a31624..6ee1dd0deadc 100644
2169 --- a/arch/x86/Kconfig
2170 +++ b/arch/x86/Kconfig
2171 @@ -17,6 +17,7 @@ config X86_64
2172  ### Arch settings
2173  config X86
2174         def_bool y
2175 +       select HAVE_PREEMPT_LAZY
2176         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
2177         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
2178         select ANON_INODES
2179 @@ -212,8 +213,11 @@ config ARCH_MAY_HAVE_PC_FDC
2180         def_bool y
2181         depends on ISA_DMA_API
2183 +config RWSEM_GENERIC_SPINLOCK
2184 +       def_bool PREEMPT_RT_FULL
2186  config RWSEM_XCHGADD_ALGORITHM
2187 -       def_bool y
2188 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
2190  config GENERIC_CALIBRATE_DELAY
2191         def_bool y
2192 @@ -848,7 +852,7 @@ config IOMMU_HELPER
2193  config MAXSMP
2194         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
2195         depends on X86_64 && SMP && DEBUG_KERNEL
2196 -       select CPUMASK_OFFSTACK
2197 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
2198         ---help---
2199           Enable maximum number of CPUS and NUMA Nodes for this architecture.
2200           If unsure, say N.
2201 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
2202 index 3633ad6145c5..c6d5458ee7f9 100644
2203 --- a/arch/x86/crypto/aesni-intel_glue.c
2204 +++ b/arch/x86/crypto/aesni-intel_glue.c
2205 @@ -383,14 +383,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
2206         err = blkcipher_walk_virt(desc, &walk);
2207         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2209 -       kernel_fpu_begin();
2210         while ((nbytes = walk.nbytes)) {
2211 +               kernel_fpu_begin();
2212                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2213 -                             nbytes & AES_BLOCK_MASK);
2214 +                               nbytes & AES_BLOCK_MASK);
2215 +               kernel_fpu_end();
2216                 nbytes &= AES_BLOCK_SIZE - 1;
2217                 err = blkcipher_walk_done(desc, &walk, nbytes);
2218         }
2219 -       kernel_fpu_end();
2221         return err;
2223 @@ -407,14 +407,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
2224         err = blkcipher_walk_virt(desc, &walk);
2225         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2227 -       kernel_fpu_begin();
2228         while ((nbytes = walk.nbytes)) {
2229 +               kernel_fpu_begin();
2230                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2231                               nbytes & AES_BLOCK_MASK);
2232 +               kernel_fpu_end();
2233                 nbytes &= AES_BLOCK_SIZE - 1;
2234                 err = blkcipher_walk_done(desc, &walk, nbytes);
2235         }
2236 -       kernel_fpu_end();
2238         return err;
2240 @@ -431,14 +431,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
2241         err = blkcipher_walk_virt(desc, &walk);
2242         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2244 -       kernel_fpu_begin();
2245         while ((nbytes = walk.nbytes)) {
2246 +               kernel_fpu_begin();
2247                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2248                               nbytes & AES_BLOCK_MASK, walk.iv);
2249 +               kernel_fpu_end();
2250                 nbytes &= AES_BLOCK_SIZE - 1;
2251                 err = blkcipher_walk_done(desc, &walk, nbytes);
2252         }
2253 -       kernel_fpu_end();
2255         return err;
2257 @@ -455,14 +455,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
2258         err = blkcipher_walk_virt(desc, &walk);
2259         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2261 -       kernel_fpu_begin();
2262         while ((nbytes = walk.nbytes)) {
2263 +               kernel_fpu_begin();
2264                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2265                               nbytes & AES_BLOCK_MASK, walk.iv);
2266 +               kernel_fpu_end();
2267                 nbytes &= AES_BLOCK_SIZE - 1;
2268                 err = blkcipher_walk_done(desc, &walk, nbytes);
2269         }
2270 -       kernel_fpu_end();
2272         return err;
2274 @@ -514,18 +514,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
2275         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
2276         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2278 -       kernel_fpu_begin();
2279         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
2280 +               kernel_fpu_begin();
2281                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
2282                                       nbytes & AES_BLOCK_MASK, walk.iv);
2283 +               kernel_fpu_end();
2284                 nbytes &= AES_BLOCK_SIZE - 1;
2285                 err = blkcipher_walk_done(desc, &walk, nbytes);
2286         }
2287         if (walk.nbytes) {
2288 +               kernel_fpu_begin();
2289                 ctr_crypt_final(ctx, &walk);
2290 +               kernel_fpu_end();
2291                 err = blkcipher_walk_done(desc, &walk, 0);
2292         }
2293 -       kernel_fpu_end();
2295         return err;
2297 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
2298 index 8648158f3916..d7699130ee36 100644
2299 --- a/arch/x86/crypto/cast5_avx_glue.c
2300 +++ b/arch/x86/crypto/cast5_avx_glue.c
2301 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
2302  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2303                      bool enc)
2305 -       bool fpu_enabled = false;
2306 +       bool fpu_enabled;
2307         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
2308         const unsigned int bsize = CAST5_BLOCK_SIZE;
2309         unsigned int nbytes;
2310 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2311                 u8 *wsrc = walk->src.virt.addr;
2312                 u8 *wdst = walk->dst.virt.addr;
2314 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2315 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2317                 /* Process multi-block batch */
2318                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
2319 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
2320                 } while (nbytes >= bsize);
2322  done:
2323 +               cast5_fpu_end(fpu_enabled);
2324                 err = blkcipher_walk_done(desc, walk, nbytes);
2325         }
2327 -       cast5_fpu_end(fpu_enabled);
2328         return err;
2331 @@ -227,7 +226,7 @@ done:
2332  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2333                        struct scatterlist *src, unsigned int nbytes)
2335 -       bool fpu_enabled = false;
2336 +       bool fpu_enabled;
2337         struct blkcipher_walk walk;
2338         int err;
2340 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2341         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2343         while ((nbytes = walk.nbytes)) {
2344 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2345 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2346                 nbytes = __cbc_decrypt(desc, &walk);
2347 +               cast5_fpu_end(fpu_enabled);
2348                 err = blkcipher_walk_done(desc, &walk, nbytes);
2349         }
2351 -       cast5_fpu_end(fpu_enabled);
2352         return err;
2355 @@ -311,7 +309,7 @@ done:
2356  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2357                      struct scatterlist *src, unsigned int nbytes)
2359 -       bool fpu_enabled = false;
2360 +       bool fpu_enabled;
2361         struct blkcipher_walk walk;
2362         int err;
2364 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
2365         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
2367         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
2368 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
2369 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
2370                 nbytes = __ctr_crypt(desc, &walk);
2371 +               cast5_fpu_end(fpu_enabled);
2372                 err = blkcipher_walk_done(desc, &walk, nbytes);
2373         }
2375 -       cast5_fpu_end(fpu_enabled);
2377         if (walk.nbytes) {
2378                 ctr_crypt_final(desc, &walk);
2379                 err = blkcipher_walk_done(desc, &walk, 0);
2380 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
2381 index 6a85598931b5..3a506ce7ed93 100644
2382 --- a/arch/x86/crypto/glue_helper.c
2383 +++ b/arch/x86/crypto/glue_helper.c
2384 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2385         void *ctx = crypto_blkcipher_ctx(desc->tfm);
2386         const unsigned int bsize = 128 / 8;
2387         unsigned int nbytes, i, func_bytes;
2388 -       bool fpu_enabled = false;
2389 +       bool fpu_enabled;
2390         int err;
2392         err = blkcipher_walk_virt(desc, walk);
2393 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2394                 u8 *wdst = walk->dst.virt.addr;
2396                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2397 -                                            desc, fpu_enabled, nbytes);
2398 +                                            desc, false, nbytes);
2400                 for (i = 0; i < gctx->num_funcs; i++) {
2401                         func_bytes = bsize * gctx->funcs[i].num_blocks;
2402 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
2403                 }
2405  done:
2406 +               glue_fpu_end(fpu_enabled);
2407                 err = blkcipher_walk_done(desc, walk, nbytes);
2408         }
2410 -       glue_fpu_end(fpu_enabled);
2411         return err;
2414 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2415                             struct scatterlist *src, unsigned int nbytes)
2417         const unsigned int bsize = 128 / 8;
2418 -       bool fpu_enabled = false;
2419 +       bool fpu_enabled;
2420         struct blkcipher_walk walk;
2421         int err;
2423 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
2425         while ((nbytes = walk.nbytes)) {
2426                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2427 -                                            desc, fpu_enabled, nbytes);
2428 +                                            desc, false, nbytes);
2429                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
2430 +               glue_fpu_end(fpu_enabled);
2431                 err = blkcipher_walk_done(desc, &walk, nbytes);
2432         }
2434 -       glue_fpu_end(fpu_enabled);
2435         return err;
2437  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
2438 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2439                           struct scatterlist *src, unsigned int nbytes)
2441         const unsigned int bsize = 128 / 8;
2442 -       bool fpu_enabled = false;
2443 +       bool fpu_enabled;
2444         struct blkcipher_walk walk;
2445         int err;
2447 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
2449         while ((nbytes = walk.nbytes) >= bsize) {
2450                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2451 -                                            desc, fpu_enabled, nbytes);
2452 +                                            desc, false, nbytes);
2453                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
2454 +               glue_fpu_end(fpu_enabled);
2455                 err = blkcipher_walk_done(desc, &walk, nbytes);
2456         }
2458 -       glue_fpu_end(fpu_enabled);
2460         if (walk.nbytes) {
2461                 glue_ctr_crypt_final_128bit(
2462                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
2463 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2464                           void *tweak_ctx, void *crypt_ctx)
2466         const unsigned int bsize = 128 / 8;
2467 -       bool fpu_enabled = false;
2468 +       bool fpu_enabled;
2469         struct blkcipher_walk walk;
2470         int err;
2472 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
2474         /* set minimum length to bsize, for tweak_fn */
2475         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2476 -                                    desc, fpu_enabled,
2477 +                                    desc, false,
2478                                      nbytes < bsize ? bsize : nbytes);
2480         /* calculate first value of T */
2481         tweak_fn(tweak_ctx, walk.iv, walk.iv);
2482 +       glue_fpu_end(fpu_enabled);
2484         while (nbytes) {
2485 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
2486 +                               desc, false, nbytes);
2487                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
2489 +               glue_fpu_end(fpu_enabled);
2490                 err = blkcipher_walk_done(desc, &walk, nbytes);
2491                 nbytes = walk.nbytes;
2492         }
2494 -       glue_fpu_end(fpu_enabled);
2496         return err;
2498  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
2499 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2500 index 1a4477cedc49..75a301b6a5b6 100644
2501 --- a/arch/x86/entry/common.c
2502 +++ b/arch/x86/entry/common.c
2503 @@ -220,7 +220,7 @@ long syscall_trace_enter(struct pt_regs *regs)
2505  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2506         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2507 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2508 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2510  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2512 @@ -236,9 +236,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2513                 /* We have work to do. */
2514                 local_irq_enable();
2516 -               if (cached_flags & _TIF_NEED_RESCHED)
2517 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2518                         schedule();
2520 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2521 +               if (unlikely(current->forced_info.si_signo)) {
2522 +                       struct task_struct *t = current;
2523 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2524 +                       t->forced_info.si_signo = 0;
2525 +               }
2526 +#endif
2527                 if (cached_flags & _TIF_UPROBE)
2528                         uprobe_notify_resume(regs);
2530 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2531 index ae678ad128a9..3bcef8bdb911 100644
2532 --- a/arch/x86/entry/entry_32.S
2533 +++ b/arch/x86/entry/entry_32.S
2534 @@ -278,8 +278,24 @@ END(ret_from_exception)
2535  ENTRY(resume_kernel)
2536         DISABLE_INTERRUPTS(CLBR_ANY)
2537  need_resched:
2538 +       # preempt count == 0 + NEED_RS set?
2539         cmpl    $0, PER_CPU_VAR(__preempt_count)
2540 +#ifndef CONFIG_PREEMPT_LAZY
2541         jnz     restore_all
2542 +#else
2543 +       jz test_int_off
2545 +       # at least preempt count == 0 ?
2546 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2547 +       jne restore_all
2549 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2550 +       jnz restore_all
2552 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2553 +       jz restore_all
2554 +test_int_off:
2555 +#endif
2556         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2557         jz      restore_all
2558         call    preempt_schedule_irq
2559 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2560 index a55697d19824..316081a2ca85 100644
2561 --- a/arch/x86/entry/entry_64.S
2562 +++ b/arch/x86/entry/entry_64.S
2563 @@ -579,7 +579,23 @@ retint_kernel:
2564         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2565         jnc     1f
2566  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2567 +#ifndef CONFIG_PREEMPT_LAZY
2568         jnz     1f
2569 +#else
2570 +       jz      do_preempt_schedule_irq
2572 +       # at least preempt count == 0 ?
2573 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2574 +       jnz     1f
2576 +       GET_THREAD_INFO(%rcx)
2577 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2578 +       jnz     1f
2580 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2581 +       jnc     1f
2582 +do_preempt_schedule_irq:
2583 +#endif
2584         call    preempt_schedule_irq
2585         jmp     0b
2586  1:
2587 @@ -867,6 +883,7 @@ bad_gs:
2588         jmp     2b
2589         .previous
2591 +#ifndef CONFIG_PREEMPT_RT_FULL
2592  /* Call softirq on interrupt stack. Interrupts are off. */
2593  ENTRY(do_softirq_own_stack)
2594         pushq   %rbp
2595 @@ -879,6 +896,7 @@ ENTRY(do_softirq_own_stack)
2596         decl    PER_CPU_VAR(irq_count)
2597         ret
2598  END(do_softirq_own_stack)
2599 +#endif
2601  #ifdef CONFIG_XEN
2602  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2603 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2604 index 01bcde84d3e4..6f432adc55cd 100644
2605 --- a/arch/x86/include/asm/preempt.h
2606 +++ b/arch/x86/include/asm/preempt.h
2607 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2608   * a decrement which hits zero means we have no preempt_count and should
2609   * reschedule.
2610   */
2611 -static __always_inline bool __preempt_count_dec_and_test(void)
2612 +static __always_inline bool ____preempt_count_dec_and_test(void)
2614         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
2617 +static __always_inline bool __preempt_count_dec_and_test(void)
2619 +       if (____preempt_count_dec_and_test())
2620 +               return true;
2621 +#ifdef CONFIG_PREEMPT_LAZY
2622 +       if (current_thread_info()->preempt_lazy_count)
2623 +               return false;
2624 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2625 +#else
2626 +       return false;
2627 +#endif
2630  /*
2631   * Returns true when we need to resched and can (barring IRQ state).
2632   */
2633  static __always_inline bool should_resched(int preempt_offset)
2635 +#ifdef CONFIG_PREEMPT_LAZY
2636 +       u32 tmp;
2638 +       tmp = raw_cpu_read_4(__preempt_count);
2639 +       if (tmp == preempt_offset)
2640 +               return true;
2642 +       /* preempt count == 0 ? */
2643 +       tmp &= ~PREEMPT_NEED_RESCHED;
2644 +       if (tmp)
2645 +               return false;
2646 +       if (current_thread_info()->preempt_lazy_count)
2647 +               return false;
2648 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2649 +#else
2650         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2651 +#endif
2654  #ifdef CONFIG_PREEMPT
2655 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2656 index 2138c9ae19ee..3f5b4ee2e2c1 100644
2657 --- a/arch/x86/include/asm/signal.h
2658 +++ b/arch/x86/include/asm/signal.h
2659 @@ -23,6 +23,19 @@ typedef struct {
2660         unsigned long sig[_NSIG_WORDS];
2661  } sigset_t;
2664 + * Because some traps use the IST stack, we must keep preemption
2665 + * disabled while calling do_trap(), but do_trap() may call
2666 + * force_sig_info() which will grab the signal spin_locks for the
2667 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2668 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2669 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2670 + * trap.
2671 + */
2672 +#if defined(CONFIG_PREEMPT_RT_FULL)
2673 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2674 +#endif
2676  #ifndef CONFIG_COMPAT
2677  typedef sigset_t compat_sigset_t;
2678  #endif
2679 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2680 index 58505f01962f..02fa39652cd6 100644
2681 --- a/arch/x86/include/asm/stackprotector.h
2682 +++ b/arch/x86/include/asm/stackprotector.h
2683 @@ -59,7 +59,7 @@
2684   */
2685  static __always_inline void boot_init_stack_canary(void)
2687 -       u64 canary;
2688 +       u64 uninitialized_var(canary);
2689         u64 tsc;
2691  #ifdef CONFIG_X86_64
2692 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2693          * of randomness. The TSC only matters for very early init,
2694          * there it already has some randomness on most systems. Later
2695          * on during the bootup the random pool has true entropy too.
2696 +        *
2697 +        * For preempt-rt we need to weaken the randomness a bit, as
2698 +        * we can't call into the random generator from atomic context
2699 +        * due to locking constraints. We just leave canary
2700 +        * uninitialized and use the TSC based randomness on top of it.
2701          */
2702 +#ifndef CONFIG_PREEMPT_RT_FULL
2703         get_random_bytes(&canary, sizeof(canary));
2704 +#endif
2705         tsc = rdtsc();
2706         canary += tsc + (tsc << 32UL);
2708 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2709 index c7b551028740..ddb63bd90e3c 100644
2710 --- a/arch/x86/include/asm/thread_info.h
2711 +++ b/arch/x86/include/asm/thread_info.h
2712 @@ -58,6 +58,8 @@ struct thread_info {
2713         __u32                   status;         /* thread synchronous flags */
2714         __u32                   cpu;            /* current CPU */
2715         mm_segment_t            addr_limit;
2716 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2717 +                                                         <0 => BUG */
2718         unsigned int            sig_on_uaccess_error:1;
2719         unsigned int            uaccess_err:1;  /* uaccess failed */
2720  };
2721 @@ -95,6 +97,7 @@ struct thread_info {
2722  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2723  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2724  #define TIF_SECCOMP            8       /* secure computing */
2725 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2726  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2727  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2728  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2729 @@ -119,6 +122,7 @@ struct thread_info {
2730  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2731  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2732  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2733 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2734  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2735  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2736  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2737 @@ -152,6 +156,8 @@ struct thread_info {
2738  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2739  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2741 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2743  #define STACK_WARN             (THREAD_SIZE/8)
2745  /*
2746 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2747 index fc808b83fccb..ebb40118abf5 100644
2748 --- a/arch/x86/include/asm/uv/uv_bau.h
2749 +++ b/arch/x86/include/asm/uv/uv_bau.h
2750 @@ -615,9 +615,9 @@ struct bau_control {
2751         cycles_t                send_message;
2752         cycles_t                period_end;
2753         cycles_t                period_time;
2754 -       spinlock_t              uvhub_lock;
2755 -       spinlock_t              queue_lock;
2756 -       spinlock_t              disable_lock;
2757 +       raw_spinlock_t          uvhub_lock;
2758 +       raw_spinlock_t          queue_lock;
2759 +       raw_spinlock_t          disable_lock;
2760         /* tunables */
2761         int                     max_concurr;
2762         int                     max_concurr_const;
2763 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2764   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2765   * on equal.
2766   */
2767 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2768 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2770 -       spin_lock(lock);
2771 +       raw_spin_lock(lock);
2772         if (atomic_read(v) >= u) {
2773 -               spin_unlock(lock);
2774 +               raw_spin_unlock(lock);
2775                 return 0;
2776         }
2777         atomic_inc(v);
2778 -       spin_unlock(lock);
2779 +       raw_spin_unlock(lock);
2780         return 1;
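
With PREEMPT_RT a plain spinlock_t becomes a sleeping lock, which is not usable in the contexts the UV BAU code runs in, so the hunks above switch these locks to raw_spinlock_t, which keeps the classic preempt-disabling spinning behaviour on RT. A minimal usage sketch with made-up names:

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(example_lock);  /* stays a true spinning lock on RT */
    static int example_count;

    static void example_inc(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&example_lock, flags);
            example_count++;
            raw_spin_unlock_irqrestore(&example_lock, flags);
    }

The same spinlock_t to raw_spinlock_t conversion recurs later in this patch (uv_hub.h, the RAPL PMU, kvm.c, tlb_uv.c, uv_time.c) for the same reason.
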
2783 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
2784 index ea7074784cc4..01ec643ce66e 100644
2785 --- a/arch/x86/include/asm/uv/uv_hub.h
2786 +++ b/arch/x86/include/asm/uv/uv_hub.h
2787 @@ -492,7 +492,7 @@ struct uv_blade_info {
2788         unsigned short  nr_online_cpus;
2789         unsigned short  pnode;
2790         short           memory_nid;
2791 -       spinlock_t      nmi_lock;       /* obsolete, see uv_hub_nmi */
2792 +       raw_spinlock_t  nmi_lock;       /* obsolete, see uv_hub_nmi */
2793         unsigned long   nmi_count;      /* obsolete, see uv_hub_nmi */
2794  };
2795  extern struct uv_blade_info *uv_blade_info;
2796 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2797 index e75907601a41..a29fc4f84fc4 100644
2798 --- a/arch/x86/kernel/acpi/boot.c
2799 +++ b/arch/x86/kernel/acpi/boot.c
2800 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2801   *             ->ioapic_mutex
2802   *                     ->ioapic_lock
2803   */
2804 +#ifdef CONFIG_X86_IO_APIC
2805  static DEFINE_MUTEX(acpi_ioapic_lock);
2806 +#endif
2808  /* --------------------------------------------------------------------------
2809                                Boot-time Configuration
2810 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2811 index 1e5d2f07416b..ac0127bb155e 100644
2812 --- a/arch/x86/kernel/apic/io_apic.c
2813 +++ b/arch/x86/kernel/apic/io_apic.c
2814 @@ -1711,7 +1711,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2815  static inline bool ioapic_irqd_mask(struct irq_data *data)
2817         /* If we are moving the irq we need to mask it */
2818 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2819 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2820 +                    !irqd_irq_inprogress(data))) {
2821                 mask_ioapic_irq(data);
2822                 return true;
2823         }
2824 diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
2825 index 4a139465f1d4..ad2afff02b36 100644
2826 --- a/arch/x86/kernel/apic/x2apic_uv_x.c
2827 +++ b/arch/x86/kernel/apic/x2apic_uv_x.c
2828 @@ -947,7 +947,7 @@ void __init uv_system_init(void)
2829                         uv_blade_info[blade].pnode = pnode;
2830                         uv_blade_info[blade].nr_possible_cpus = 0;
2831                         uv_blade_info[blade].nr_online_cpus = 0;
2832 -                       spin_lock_init(&uv_blade_info[blade].nmi_lock);
2833 +                       raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
2834                         min_pnode = min(pnode, min_pnode);
2835                         max_pnode = max(pnode, max_pnode);
2836                         blade++;
2837 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2838 index 439df975bc7a..b7954ddd6a0a 100644
2839 --- a/arch/x86/kernel/asm-offsets.c
2840 +++ b/arch/x86/kernel/asm-offsets.c
2841 @@ -32,6 +32,7 @@ void common(void) {
2842         OFFSET(TI_flags, thread_info, flags);
2843         OFFSET(TI_status, thread_info, status);
2844         OFFSET(TI_addr_limit, thread_info, addr_limit);
2845 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2847         BLANK();
2848         OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
2849 @@ -89,4 +90,5 @@ void common(void) {
2851         BLANK();
2852         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2853 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2855 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2856 index 7e8a736d09db..430a4ec07811 100644
2857 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2858 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2859 @@ -41,6 +41,8 @@
2860  #include <linux/debugfs.h>
2861  #include <linux/irq_work.h>
2862  #include <linux/export.h>
2863 +#include <linux/jiffies.h>
2864 +#include <linux/swork.h>
2866  #include <asm/processor.h>
2867  #include <asm/traps.h>
2868 @@ -1236,7 +1238,7 @@ void mce_log_therm_throt_event(__u64 status)
2869  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2871  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2872 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2873 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2875  static unsigned long mce_adjust_timer_default(unsigned long interval)
2877 @@ -1245,32 +1247,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2879  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2881 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2882 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2884 -       unsigned long when = jiffies + interval;
2885 -       unsigned long flags;
2887 -       local_irq_save(flags);
2889 -       if (timer_pending(t)) {
2890 -               if (time_before(when, t->expires))
2891 -                       mod_timer_pinned(t, when);
2892 -       } else {
2893 -               t->expires = round_jiffies(when);
2894 -               add_timer_on(t, smp_processor_id());
2895 -       }
2897 -       local_irq_restore(flags);
2898 +       if (!interval)
2899 +               return HRTIMER_NORESTART;
2900 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2901 +       return HRTIMER_RESTART;
2904 -static void mce_timer_fn(unsigned long data)
2905 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2907 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2908 -       int cpu = smp_processor_id();
2909         unsigned long iv;
2911 -       WARN_ON(cpu != data);
2913         iv = __this_cpu_read(mce_next_interval);
2915         if (mce_available(this_cpu_ptr(&cpu_info))) {
2916 @@ -1293,7 +1281,7 @@ static void mce_timer_fn(unsigned long data)
2918  done:
2919         __this_cpu_write(mce_next_interval, iv);
2920 -       __restart_timer(t, iv);
2921 +       return __restart_timer(timer, iv);
2924  /*
2925 @@ -1301,7 +1289,7 @@ done:
2926   */
2927  void mce_timer_kick(unsigned long interval)
2929 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2930 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2931         unsigned long iv = __this_cpu_read(mce_next_interval);
2933         __restart_timer(t, interval);
2934 @@ -1316,7 +1304,7 @@ static void mce_timer_delete_all(void)
2935         int cpu;
2937         for_each_online_cpu(cpu)
2938 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2939 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2942  static void mce_do_trigger(struct work_struct *work)
2943 @@ -1326,6 +1314,56 @@ static void mce_do_trigger(struct work_struct *work)
2945  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2947 +static void __mce_notify_work(struct swork_event *event)
2949 +       /* Not more than two messages every minute */
2950 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2952 +       /* wake processes polling /dev/mcelog */
2953 +       wake_up_interruptible(&mce_chrdev_wait);
2955 +       /*
2956 +        * There is no risk of missing notifications because
2957 +        * work_pending is always cleared before the function is
2958 +        * executed.
2959 +        */
2960 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2961 +               schedule_work(&mce_trigger_work);
2963 +       if (__ratelimit(&ratelimit))
2964 +               pr_info(HW_ERR "Machine check events logged\n");
2967 +#ifdef CONFIG_PREEMPT_RT_FULL
2968 +static bool notify_work_ready __read_mostly;
2969 +static struct swork_event notify_work;
2971 +static int mce_notify_work_init(void)
2973 +       int err;
2975 +       err = swork_get();
2976 +       if (err)
2977 +               return err;
2979 +       INIT_SWORK(&notify_work, __mce_notify_work);
2980 +       notify_work_ready = true;
2981 +       return 0;
2984 +static void mce_notify_work(void)
2986 +       if (notify_work_ready)
2987 +               swork_queue(&notify_work);
2989 +#else
2990 +static void mce_notify_work(void)
2992 +       __mce_notify_work(NULL);
2994 +static inline int mce_notify_work_init(void) { return 0; }
2995 +#endif
2997  /*
2998   * Notify the user(s) about new machine check events.
2999   * Can be called from interrupt context, but not from machine check/NMI
3000 @@ -1333,19 +1371,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
3001   */
3002  int mce_notify_irq(void)
3004 -       /* Not more than two messages every minute */
3005 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
3007         if (test_and_clear_bit(0, &mce_need_notify)) {
3008 -               /* wake processes polling /dev/mcelog */
3009 -               wake_up_interruptible(&mce_chrdev_wait);
3011 -               if (mce_helper[0])
3012 -                       schedule_work(&mce_trigger_work);
3014 -               if (__ratelimit(&ratelimit))
3015 -                       pr_info(HW_ERR "Machine check events logged\n");
3017 +               mce_notify_work();
3018                 return 1;
3019         }
3020         return 0;
3021 @@ -1639,7 +1666,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
3022         }
3025 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3026 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
3028         unsigned long iv = check_interval * HZ;
3030 @@ -1648,16 +1675,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
3032         per_cpu(mce_next_interval, cpu) = iv;
3034 -       t->expires = round_jiffies(jiffies + iv);
3035 -       add_timer_on(t, cpu);
3036 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
3037 +                       0, HRTIMER_MODE_REL_PINNED);
3040  static void __mcheck_cpu_init_timer(void)
3042 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
3043 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
3044         unsigned int cpu = smp_processor_id();
3046 -       setup_timer(t, mce_timer_fn, cpu);
3047 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3048 +       t->function = mce_timer_fn;
3049         mce_start_timer(cpu, t);
3052 @@ -2376,6 +2404,8 @@ static void mce_disable_cpu(void *h)
3053         if (!mce_available(raw_cpu_ptr(&cpu_info)))
3054                 return;
3056 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
3058         if (!(action & CPU_TASKS_FROZEN))
3059                 cmci_clear();
3061 @@ -2398,6 +2428,7 @@ static void mce_reenable_cpu(void *h)
3062                 if (b->init)
3063                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
3064         }
3065 +       __mcheck_cpu_init_timer();
3068  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
3069 @@ -2405,7 +2436,6 @@ static int
3070  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3072         unsigned int cpu = (unsigned long)hcpu;
3073 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
3075         switch (action & ~CPU_TASKS_FROZEN) {
3076         case CPU_ONLINE:
3077 @@ -2425,11 +2455,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
3078                 break;
3079         case CPU_DOWN_PREPARE:
3080                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
3081 -               del_timer_sync(t);
3082                 break;
3083         case CPU_DOWN_FAILED:
3084                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
3085 -               mce_start_timer(cpu, t);
3086                 break;
3087         }
3089 @@ -2468,6 +2496,10 @@ static __init int mcheck_init_device(void)
3090                 goto err_out;
3091         }
3093 +       err = mce_notify_work_init();
3094 +       if (err)
3095 +               goto err_out;
3097         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
3098                 err = -ENOMEM;
3099                 goto err_out;
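
Two RT-related changes to the MCE code above: the per-CPU polling timer moves from a timer_list to an hrtimer, and the user-space notification (waking /dev/mcelog pollers and kicking the trigger work) is deferred to an swork item, a simple work mechanism added elsewhere in this series, because those wakeups are not safe from the contexts the MCE code runs in on RT. As a generic reference, independent of the MCE specifics, a periodic hrtimer looks roughly like this (names invented):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer example_timer;

    static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
    {
            /* ... periodic work ... */
            hrtimer_forward_now(t, ms_to_ktime(100));   /* re-arm 100 ms from now */
            return HRTIMER_RESTART;
    }

    static void example_timer_start(void)
    {
            hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            example_timer.function = example_timer_fn;
            hrtimer_start(&example_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
    }
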
3100 diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3101 index ed446bdcbf31..d2ac364e2118 100644
3102 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3103 +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
3104 @@ -117,7 +117,7 @@ static struct perf_pmu_events_attr event_attr_##v = {                       \
3105  };
3107  struct rapl_pmu {
3108 -       spinlock_t       lock;
3109 +       raw_spinlock_t   lock;
3110         int              n_active; /* number of active events */
3111         struct list_head active_list;
3112         struct pmu       *pmu; /* pointer to rapl_pmu_class */
3113 @@ -220,13 +220,13 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
3114         if (!pmu->n_active)
3115                 return HRTIMER_NORESTART;
3117 -       spin_lock_irqsave(&pmu->lock, flags);
3118 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3120         list_for_each_entry(event, &pmu->active_list, active_entry) {
3121                 rapl_event_update(event);
3122         }
3124 -       spin_unlock_irqrestore(&pmu->lock, flags);
3125 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3127         hrtimer_forward_now(hrtimer, pmu->timer_interval);
3129 @@ -263,9 +263,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
3130         struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
3131         unsigned long flags;
3133 -       spin_lock_irqsave(&pmu->lock, flags);
3134 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3135         __rapl_pmu_event_start(pmu, event);
3136 -       spin_unlock_irqrestore(&pmu->lock, flags);
3137 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3140  static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3141 @@ -274,7 +274,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3142         struct hw_perf_event *hwc = &event->hw;
3143         unsigned long flags;
3145 -       spin_lock_irqsave(&pmu->lock, flags);
3146 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3148         /* mark event as deactivated and stopped */
3149         if (!(hwc->state & PERF_HES_STOPPED)) {
3150 @@ -299,7 +299,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
3151                 hwc->state |= PERF_HES_UPTODATE;
3152         }
3154 -       spin_unlock_irqrestore(&pmu->lock, flags);
3155 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3158  static int rapl_pmu_event_add(struct perf_event *event, int mode)
3159 @@ -308,14 +308,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode)
3160         struct hw_perf_event *hwc = &event->hw;
3161         unsigned long flags;
3163 -       spin_lock_irqsave(&pmu->lock, flags);
3164 +       raw_spin_lock_irqsave(&pmu->lock, flags);
3166         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
3168         if (mode & PERF_EF_START)
3169                 __rapl_pmu_event_start(pmu, event);
3171 -       spin_unlock_irqrestore(&pmu->lock, flags);
3172 +       raw_spin_unlock_irqrestore(&pmu->lock, flags);
3174         return 0;
3176 @@ -603,7 +603,7 @@ static int rapl_cpu_prepare(int cpu)
3177         pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
3178         if (!pmu)
3179                 return -1;
3180 -       spin_lock_init(&pmu->lock);
3181 +       raw_spin_lock_init(&pmu->lock);
3183         INIT_LIST_HEAD(&pmu->active_list);
3185 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
3186 index 464ffd69b92e..00db1aad1548 100644
3187 --- a/arch/x86/kernel/dumpstack_32.c
3188 +++ b/arch/x86/kernel/dumpstack_32.c
3189 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3190                 unsigned long *stack, unsigned long bp,
3191                 const struct stacktrace_ops *ops, void *data)
3193 -       const unsigned cpu = get_cpu();
3194 +       const unsigned cpu = get_cpu_light();
3195         int graph = 0;
3196         u32 *prev_esp;
3198 @@ -86,7 +86,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3199                         break;
3200                 touch_nmi_watchdog();
3201         }
3202 -       put_cpu();
3203 +       put_cpu_light();
3205  EXPORT_SYMBOL(dump_trace);
3207 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
3208 index 5f1c6266eb30..c331e3fef465 100644
3209 --- a/arch/x86/kernel/dumpstack_64.c
3210 +++ b/arch/x86/kernel/dumpstack_64.c
3211 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3212                 unsigned long *stack, unsigned long bp,
3213                 const struct stacktrace_ops *ops, void *data)
3215 -       const unsigned cpu = get_cpu();
3216 +       const unsigned cpu = get_cpu_light();
3217         struct thread_info *tinfo;
3218         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
3219         unsigned long dummy;
3220 @@ -241,7 +241,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
3221          * This handles the process stack:
3222          */
3223         bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
3224 -       put_cpu();
3225 +       put_cpu_light();
3227  EXPORT_SYMBOL(dump_trace);
3229 @@ -255,7 +255,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3230         int cpu;
3231         int i;
3233 -       preempt_disable();
3234 +       migrate_disable();
3235         cpu = smp_processor_id();
3237         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
3238 @@ -291,7 +291,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
3239                         pr_cont(" %016lx", *stack++);
3240                 touch_nmi_watchdog();
3241         }
3242 -       preempt_enable();
3243 +       migrate_enable();
3245         pr_cont("\n");
3246         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
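
get_cpu()/put_cpu() disable preemption and turn everything in between into an atomic section; on RT the stack dumper may need to take sleeping locks there, so the hunks above use get_cpu_light()/put_cpu_light() and migrate_disable()/migrate_enable(), helpers that exist only with the RT patch and merely pin the task to its current CPU. A hedged sketch of the intended usage:

    static void example_walk_this_cpu(void)
    {
            /* roughly: migrate_disable(); smp_processor_id(); on RT kernels */
            int cpu = get_cpu_light();

            /* ... access per-CPU state for 'cpu'; may sleep on RT ... */

            put_cpu_light();
    }
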
3247 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
3248 index 38da8f29a9c8..ce71f7098f15 100644
3249 --- a/arch/x86/kernel/irq_32.c
3250 +++ b/arch/x86/kernel/irq_32.c
3251 @@ -128,6 +128,7 @@ void irq_ctx_init(int cpu)
3252                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
3255 +#ifndef CONFIG_PREEMPT_RT_FULL
3256  void do_softirq_own_stack(void)
3258         struct thread_info *curstk;
3259 @@ -146,6 +147,7 @@ void do_softirq_own_stack(void)
3261         call_on_stack(__do_softirq, isp);
3263 +#endif
3265  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
3267 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
3268 index 47190bd399e7..807950860fb7 100644
3269 --- a/arch/x86/kernel/kvm.c
3270 +++ b/arch/x86/kernel/kvm.c
3271 @@ -36,6 +36,7 @@
3272  #include <linux/kprobes.h>
3273  #include <linux/debugfs.h>
3274  #include <linux/nmi.h>
3275 +#include <linux/swait.h>
3276  #include <asm/timer.h>
3277  #include <asm/cpu.h>
3278  #include <asm/traps.h>
3279 @@ -91,14 +92,14 @@ static void kvm_io_delay(void)
3281  struct kvm_task_sleep_node {
3282         struct hlist_node link;
3283 -       wait_queue_head_t wq;
3284 +       struct swait_queue_head wq;
3285         u32 token;
3286         int cpu;
3287         bool halted;
3288  };
3290  static struct kvm_task_sleep_head {
3291 -       spinlock_t lock;
3292 +       raw_spinlock_t lock;
3293         struct hlist_head list;
3294  } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
3296 @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
3297         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
3298         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
3299         struct kvm_task_sleep_node n, *e;
3300 -       DEFINE_WAIT(wait);
3301 +       DECLARE_SWAITQUEUE(wait);
3303         rcu_irq_enter();
3305 -       spin_lock(&b->lock);
3306 +       raw_spin_lock(&b->lock);
3307         e = _find_apf_task(b, token);
3308         if (e) {
3309                 /* dummy entry exist -> wake up was delivered ahead of PF */
3310                 hlist_del(&e->link);
3311                 kfree(e);
3312 -               spin_unlock(&b->lock);
3313 +               raw_spin_unlock(&b->lock);
3315                 rcu_irq_exit();
3316                 return;
3317 @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
3318         n.token = token;
3319         n.cpu = smp_processor_id();
3320         n.halted = is_idle_task(current) || preempt_count() > 1;
3321 -       init_waitqueue_head(&n.wq);
3322 +       init_swait_queue_head(&n.wq);
3323         hlist_add_head(&n.link, &b->list);
3324 -       spin_unlock(&b->lock);
3325 +       raw_spin_unlock(&b->lock);
3327         for (;;) {
3328                 if (!n.halted)
3329 -                       prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3330 +                       prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
3331                 if (hlist_unhashed(&n.link))
3332                         break;
3334 @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
3335                 }
3336         }
3337         if (!n.halted)
3338 -               finish_wait(&n.wq, &wait);
3339 +               finish_swait(&n.wq, &wait);
3341         rcu_irq_exit();
3342         return;
3343 @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
3344         hlist_del_init(&n->link);
3345         if (n->halted)
3346                 smp_send_reschedule(n->cpu);
3347 -       else if (waitqueue_active(&n->wq))
3348 -               wake_up(&n->wq);
3349 +       else if (swait_active(&n->wq))
3350 +               swake_up(&n->wq);
3353  static void apf_task_wake_all(void)
3354 @@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
3355         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
3356                 struct hlist_node *p, *next;
3357                 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
3358 -               spin_lock(&b->lock);
3359 +               raw_spin_lock(&b->lock);
3360                 hlist_for_each_safe(p, next, &b->list) {
3361                         struct kvm_task_sleep_node *n =
3362                                 hlist_entry(p, typeof(*n), link);
3363                         if (n->cpu == smp_processor_id())
3364                                 apf_task_wake_one(n);
3365                 }
3366 -               spin_unlock(&b->lock);
3367 +               raw_spin_unlock(&b->lock);
3368         }
3371 @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
3372         }
3374  again:
3375 -       spin_lock(&b->lock);
3376 +       raw_spin_lock(&b->lock);
3377         n = _find_apf_task(b, token);
3378         if (!n) {
3379                 /*
3380 @@ -225,17 +226,17 @@ again:
3381                          * Allocation failed! Busy wait while other cpu
3382                          * handles async PF.
3383                          */
3384 -                       spin_unlock(&b->lock);
3385 +                       raw_spin_unlock(&b->lock);
3386                         cpu_relax();
3387                         goto again;
3388                 }
3389                 n->token = token;
3390                 n->cpu = smp_processor_id();
3391 -               init_waitqueue_head(&n->wq);
3392 +               init_swait_queue_head(&n->wq);
3393                 hlist_add_head(&n->link, &b->list);
3394         } else
3395                 apf_task_wake_one(n);
3396 -       spin_unlock(&b->lock);
3397 +       raw_spin_unlock(&b->lock);
3398         return;
3400  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
3401 @@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
3402         paravirt_ops_setup();
3403         register_reboot_notifier(&kvm_pv_reboot_nb);
3404         for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
3405 -               spin_lock_init(&async_pf_sleepers[i].lock);
3406 +               raw_spin_lock_init(&async_pf_sleepers[i].lock);
3407         if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
3408                 x86_init.irqs.trap_init = kvm_apf_trap_init;
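
The async page-fault hunks above replace the regular waitqueue with a "simple" waitqueue (swait, provided for 4.4 by this series) and the bucket spinlock with a raw one, because the wakeup side runs in exception context and a normal waitqueue's wake-up path can sleep on RT. Basic swait usage, with made-up names:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
    static bool example_done;

    static void example_wait(void)
    {
            swait_event(example_wq, example_done);  /* sleep until the condition holds */
    }

    static void example_wake(void)
    {
            example_done = true;
            swake_up(&example_wq);                  /* wake one waiter; raw-lock based */
    }
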
3410 diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
3411 index 697f90db0e37..424aec4a4c71 100644
3412 --- a/arch/x86/kernel/nmi.c
3413 +++ b/arch/x86/kernel/nmi.c
3414 @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
3415  #endif
3417         if (panic_on_unrecovered_nmi)
3418 -               panic("NMI: Not continuing");
3419 +               nmi_panic(regs, "NMI: Not continuing");
3421         pr_emerg("Dazed and confused, but trying to continue\n");
3423 @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
3424                  reason, smp_processor_id());
3425         show_regs(regs);
3427 -       if (panic_on_io_nmi)
3428 -               panic("NMI IOCK error: Not continuing");
3429 +       if (panic_on_io_nmi) {
3430 +               nmi_panic(regs, "NMI IOCK error: Not continuing");
3432 +               /*
3433 +                * If we end up here, it means we have received an NMI while
3434 +                * processing panic(). Simply return without delaying and
3435 +                * re-enabling NMIs.
3436 +                */
3437 +               return;
3438 +       }
3440         /* Re-enable the IOCK line, wait for a few seconds */
3441         reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
3442 @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
3444         pr_emerg("Do you have a strange power saving mode enabled?\n");
3445         if (unknown_nmi_panic || panic_on_unrecovered_nmi)
3446 -               panic("NMI: Not continuing");
3447 +               nmi_panic(regs, "NMI: Not continuing");
3449         pr_emerg("Dazed and confused, but trying to continue\n");
3451 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
3452 index 9f950917528b..4dd4beae917a 100644
3453 --- a/arch/x86/kernel/process_32.c
3454 +++ b/arch/x86/kernel/process_32.c
3455 @@ -35,6 +35,7 @@
3456  #include <linux/uaccess.h>
3457  #include <linux/io.h>
3458  #include <linux/kdebug.h>
3459 +#include <linux/highmem.h>
3461  #include <asm/pgtable.h>
3462  #include <asm/ldt.h>
3463 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
3465  EXPORT_SYMBOL_GPL(start_thread);
3467 +#ifdef CONFIG_PREEMPT_RT_FULL
3468 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
3470 +       int i;
3472 +       /*
3473 +        * Clear @prev's kmap_atomic mappings
3474 +        */
3475 +       for (i = 0; i < prev_p->kmap_idx; i++) {
3476 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3477 +               pte_t *ptep = kmap_pte - idx;
3479 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
3480 +       }
3481 +       /*
3482 +        * Restore @next_p's kmap_atomic mappings
3483 +        */
3484 +       for (i = 0; i < next_p->kmap_idx; i++) {
3485 +               int idx = i + KM_TYPE_NR * smp_processor_id();
3487 +               if (!pte_none(next_p->kmap_pte[i]))
3488 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
3489 +       }
3491 +#else
3492 +static inline void
3493 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
3494 +#endif
3497  /*
3498   *     switch_to(x,y) should switch tasks from x to y.
3499 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
3500                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
3501                 __switch_to_xtra(prev_p, next_p, tss);
3503 +       switch_kmaps(prev_p, next_p);
3505         /*
3506          * Leave lazy mode, flushing any hypercalls made here.
3507          * This must be done before restoring TLS segments so
3508 diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
3509 index f660d63f40fe..8384207adde2 100644
3510 --- a/arch/x86/kernel/reboot.c
3511 +++ b/arch/x86/kernel/reboot.c
3512 @@ -726,6 +726,7 @@ static int crashing_cpu;
3513  static nmi_shootdown_cb shootdown_callback;
3515  static atomic_t waiting_for_crash_ipi;
3516 +static int crash_ipi_issued;
3518  static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
3520 @@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3522         smp_send_nmi_allbutself();
3524 +       /* Kick CPUs looping in NMI context. */
3525 +       WRITE_ONCE(crash_ipi_issued, 1);
3527         msecs = 1000; /* Wait at most a second for the other cpus to stop */
3528         while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
3529                 mdelay(1);
3530 @@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
3532         /* Leave the nmi callback set */
3535 +/* Override the weak function in kernel/panic.c */
3536 +void nmi_panic_self_stop(struct pt_regs *regs)
3538 +       while (1) {
3539 +               /*
3540 +                * Wait for the crash dumping IPI to be issued, and then
3541 +                * call its callback directly.
3542 +                */
3543 +               if (READ_ONCE(crash_ipi_issued))
3544 +                       crash_nmi_callback(0, regs); /* Don't return */
3546 +               cpu_relax();
3547 +       }
3550  #else /* !CONFIG_SMP */
3551  void nmi_shootdown_cpus(nmi_shootdown_cb callback)
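
The nmi.c and reboot.c hunks above back-port the nmi_panic() infrastructure: when several CPUs take an NMI while one is already panicking, only the first caller enters panic(), and the x86 override of nmi_panic_self_stop() parks the others until the crash IPI is issued, then runs the crash callback directly. A simplified sketch of the idea (the real code lives in kernel/panic.c and tracks the panicking CPU in a shared panic_cpu variable):

    #include <linux/atomic.h>
    #include <linux/kernel.h>
    #include <linux/ptrace.h>
    #include <linux/smp.h>

    static atomic_t example_panic_cpu = ATOMIC_INIT(-1);

    static void example_nmi_panic(struct pt_regs *regs, const char *msg)
    {
            int this_cpu = raw_smp_processor_id();
            int old = atomic_cmpxchg(&example_panic_cpu, -1, this_cpu);

            if (old == -1)
                    panic("%s", msg);               /* first CPU wins and panics */
            else if (old != this_cpu)
                    nmi_panic_self_stop(regs);      /* others wait for the crash IPI */
    }
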
3553 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
3554 index 1c96f09367ae..ffb6d9859122 100644
3555 --- a/arch/x86/kvm/lapic.c
3556 +++ b/arch/x86/kvm/lapic.c
3557 @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
3558  static void apic_timer_expired(struct kvm_lapic *apic)
3560         struct kvm_vcpu *vcpu = apic->vcpu;
3561 -       wait_queue_head_t *q = &vcpu->wq;
3562 +       struct swait_queue_head *q = &vcpu->wq;
3563         struct kvm_timer *ktimer = &apic->lapic_timer;
3565         if (atomic_read(&apic->lapic_timer.pending))
3566 @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
3567         atomic_inc(&apic->lapic_timer.pending);
3568         kvm_set_pending_timer(vcpu);
3570 -       if (waitqueue_active(q))
3571 -               wake_up_interruptible(q);
3572 +       if (swait_active(q))
3573 +               swake_up(q);
3575         if (apic_lvtt_tscdeadline(apic))
3576                 ktimer->expired_tscdeadline = ktimer->tscdeadline;
3577 @@ -1801,6 +1801,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
3578         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
3579                      HRTIMER_MODE_ABS);
3580         apic->lapic_timer.timer.function = apic_timer_fn;
3581 +       apic->lapic_timer.timer.irqsafe = 1;
3583         /*
3584          * APIC is created enabled. This will prevent kvm_lapic_set_base from
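
On RT, hrtimer callbacks are normally deferred to softirq/thread context; the lapic hunk above marks the APIC emulation timer "irqsafe" so it keeps expiring from hard interrupt context, which the guest timer emulation needs for reasonable latency. The irqsafe field exists only with the RT patch applied; a minimal sketch with made-up names:

    #include <linux/hrtimer.h>

    static struct hrtimer example_rt_timer;

    static void example_rt_timer_setup(enum hrtimer_restart (*fn)(struct hrtimer *))
    {
            hrtimer_init(&example_rt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
            example_rt_timer.function = fn;
            example_rt_timer.irqsafe = 1;   /* RT-only field: expire in hard IRQ context */
    }
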
3585 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3586 index ae2b9cd358f2..50c9861d4219 100644
3587 --- a/arch/x86/kvm/x86.c
3588 +++ b/arch/x86/kvm/x86.c
3589 @@ -5827,6 +5827,13 @@ int kvm_arch_init(void *opaque)
3590                 goto out;
3591         }
3593 +#ifdef CONFIG_PREEMPT_RT_FULL
3594 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3595 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
3596 +               return -EOPNOTSUPP;
3597 +       }
3598 +#endif
3600         r = kvm_mmu_module_init();
3601         if (r)
3602                 goto out_free_percpu;
3603 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
3604 index a6d739258137..bd24ba1c4a86 100644
3605 --- a/arch/x86/mm/highmem_32.c
3606 +++ b/arch/x86/mm/highmem_32.c
3607 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
3608   */
3609  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3611 +       pte_t pte = mk_pte(page, prot);
3612         unsigned long vaddr;
3613         int idx, type;
3615 -       preempt_disable();
3616 +       preempt_disable_nort();
3617         pagefault_disable();
3619         if (!PageHighMem(page))
3620 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
3621         idx = type + KM_TYPE_NR*smp_processor_id();
3622         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3623         BUG_ON(!pte_none(*(kmap_pte-idx)));
3624 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
3625 +#ifdef CONFIG_PREEMPT_RT_FULL
3626 +       current->kmap_pte[type] = pte;
3627 +#endif
3628 +       set_pte(kmap_pte-idx, pte);
3629         arch_flush_lazy_mmu_mode();
3631         return (void *)vaddr;
3632 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
3633                  * is a bad idea also, in case the page changes cacheability
3634                  * attributes or becomes a protected page in a hypervisor.
3635                  */
3636 +#ifdef CONFIG_PREEMPT_RT_FULL
3637 +               current->kmap_pte[type] = __pte(0);
3638 +#endif
3639                 kpte_clear_flush(kmap_pte-idx, vaddr);
3640                 kmap_atomic_idx_pop();
3641                 arch_flush_lazy_mmu_mode();
3642 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
3643  #endif
3645         pagefault_enable();
3646 -       preempt_enable();
3647 +       preempt_enable_nort();
3649  EXPORT_SYMBOL(__kunmap_atomic);
3651 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
3652 index 9c0ff045fdd4..dd25dd1671b6 100644
3653 --- a/arch/x86/mm/iomap_32.c
3654 +++ b/arch/x86/mm/iomap_32.c
3655 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
3657  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3659 +       pte_t pte = pfn_pte(pfn, prot);
3660         unsigned long vaddr;
3661         int idx, type;
3663 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
3664         type = kmap_atomic_idx_push();
3665         idx = type + KM_TYPE_NR * smp_processor_id();
3666         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
3667 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
3668 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
3670 +#ifdef CONFIG_PREEMPT_RT_FULL
3671 +       current->kmap_pte[type] = pte;
3672 +#endif
3673 +       set_pte(kmap_pte - idx, pte);
3674         arch_flush_lazy_mmu_mode();
3676         return (void *)vaddr;
3677 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
3678                  * is a bad idea also, in case the page changes cacheability
3679                  * attributes or becomes a protected page in a hypervisor.
3680                  */
3681 +#ifdef CONFIG_PREEMPT_RT_FULL
3682 +               current->kmap_pte[type] = __pte(0);
3683 +#endif
3684                 kpte_clear_flush(kmap_pte-idx, vaddr);
3685                 kmap_atomic_idx_pop();
3686         }
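
kmap_atomic() traditionally disables preemption because the fixmap slot it uses is per-CPU; the highmem_32.c and iomap_32.c hunks above, together with the switch_kmaps() hook in process_32.c, instead record the PTE per task so the mapping can be re-established after a preemption, and on RT only page faults are disabled. Callers are unchanged; typical use still looks like this:

    #include <linux/highmem.h>
    #include <linux/string.h>

    static void example_zero_highpage(struct page *page)
    {
            void *addr = kmap_atomic(page);     /* map the page at a fixmap slot */

            memset(addr, 0, PAGE_SIZE);
            kunmap_atomic(addr);                /* tear the mapping down again */
    }
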
3687 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
3688 index b599a780a5a9..2e85c4117daf 100644
3689 --- a/arch/x86/mm/pageattr.c
3690 +++ b/arch/x86/mm/pageattr.c
3691 @@ -208,7 +208,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
3692                             int in_flags, struct page **pages)
3694         unsigned int i, level;
3695 +#ifdef CONFIG_PREEMPT
3696 +       /*
3697 +        * Avoid wbinvd() because it causes latencies on all CPUs,
3698 +        * regardless of any CPU isolation that may be in effect.
3699 +        */
3700 +       unsigned long do_wbinvd = 0;
3701 +#else
3702         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
3703 +#endif
3705         BUG_ON(irqs_disabled());
3707 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3708 index 3b6ec42718e4..7871083de089 100644
3709 --- a/arch/x86/platform/uv/tlb_uv.c
3710 +++ b/arch/x86/platform/uv/tlb_uv.c
3711 @@ -714,9 +714,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
3713                 quiesce_local_uvhub(hmaster);
3715 -               spin_lock(&hmaster->queue_lock);
3716 +               raw_spin_lock(&hmaster->queue_lock);
3717                 reset_with_ipi(&bau_desc->distribution, bcp);
3718 -               spin_unlock(&hmaster->queue_lock);
3719 +               raw_spin_unlock(&hmaster->queue_lock);
3721                 end_uvhub_quiesce(hmaster);
3723 @@ -736,9 +736,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
3725                 quiesce_local_uvhub(hmaster);
3727 -               spin_lock(&hmaster->queue_lock);
3728 +               raw_spin_lock(&hmaster->queue_lock);
3729                 reset_with_ipi(&bau_desc->distribution, bcp);
3730 -               spin_unlock(&hmaster->queue_lock);
3731 +               raw_spin_unlock(&hmaster->queue_lock);
3733                 end_uvhub_quiesce(hmaster);
3735 @@ -759,7 +759,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3736         cycles_t tm1;
3738         hmaster = bcp->uvhub_master;
3739 -       spin_lock(&hmaster->disable_lock);
3740 +       raw_spin_lock(&hmaster->disable_lock);
3741         if (!bcp->baudisabled) {
3742                 stat->s_bau_disabled++;
3743                 tm1 = get_cycles();
3744 @@ -772,7 +772,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
3745                         }
3746                 }
3747         }
3748 -       spin_unlock(&hmaster->disable_lock);
3749 +       raw_spin_unlock(&hmaster->disable_lock);
3752  static void count_max_concurr(int stat, struct bau_control *bcp,
3753 @@ -835,7 +835,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
3754   */
3755  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
3757 -       spinlock_t *lock = &hmaster->uvhub_lock;
3758 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
3759         atomic_t *v;
3761         v = &hmaster->active_descriptor_count;
3762 @@ -968,7 +968,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3763         struct bau_control *hmaster;
3765         hmaster = bcp->uvhub_master;
3766 -       spin_lock(&hmaster->disable_lock);
3767 +       raw_spin_lock(&hmaster->disable_lock);
3768         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
3769                 stat->s_bau_reenabled++;
3770                 for_each_present_cpu(tcpu) {
3771 @@ -980,10 +980,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
3772                                 tbcp->period_giveups = 0;
3773                         }
3774                 }
3775 -               spin_unlock(&hmaster->disable_lock);
3776 +               raw_spin_unlock(&hmaster->disable_lock);
3777                 return 0;
3778         }
3779 -       spin_unlock(&hmaster->disable_lock);
3780 +       raw_spin_unlock(&hmaster->disable_lock);
3781         return -1;
3784 @@ -1901,9 +1901,9 @@ static void __init init_per_cpu_tunables(void)
3785                 bcp->cong_reps                  = congested_reps;
3786                 bcp->disabled_period =          sec_2_cycles(disabled_period);
3787                 bcp->giveup_limit =             giveup_limit;
3788 -               spin_lock_init(&bcp->queue_lock);
3789 -               spin_lock_init(&bcp->uvhub_lock);
3790 -               spin_lock_init(&bcp->disable_lock);
3791 +               raw_spin_lock_init(&bcp->queue_lock);
3792 +               raw_spin_lock_init(&bcp->uvhub_lock);
3793 +               raw_spin_lock_init(&bcp->disable_lock);
3794         }
3797 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
3798 index 2b158a9fa1d7..5e0b122620cb 100644
3799 --- a/arch/x86/platform/uv/uv_time.c
3800 +++ b/arch/x86/platform/uv/uv_time.c
3801 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
3803  /* There is one of these allocated per node */
3804  struct uv_rtc_timer_head {
3805 -       spinlock_t      lock;
3806 +       raw_spinlock_t  lock;
3807         /* next cpu waiting for timer, local node relative: */
3808         int             next_cpu;
3809         /* number of cpus on this node: */
3810 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
3811                                 uv_rtc_deallocate_timers();
3812                                 return -ENOMEM;
3813                         }
3814 -                       spin_lock_init(&head->lock);
3815 +                       raw_spin_lock_init(&head->lock);
3816                         head->ncpus = uv_blade_nr_possible_cpus(bid);
3817                         head->next_cpu = -1;
3818                         blade_info[bid] = head;
3819 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3820         unsigned long flags;
3821         int next_cpu;
3823 -       spin_lock_irqsave(&head->lock, flags);
3824 +       raw_spin_lock_irqsave(&head->lock, flags);
3826         next_cpu = head->next_cpu;
3827         *t = expires;
3828 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
3829                 if (uv_setup_intr(cpu, expires)) {
3830                         *t = ULLONG_MAX;
3831                         uv_rtc_find_next_timer(head, pnode);
3832 -                       spin_unlock_irqrestore(&head->lock, flags);
3833 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
3834                         return -ETIME;
3835                 }
3836         }
3838 -       spin_unlock_irqrestore(&head->lock, flags);
3839 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3840         return 0;
3843 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3844         unsigned long flags;
3845         int rc = 0;
3847 -       spin_lock_irqsave(&head->lock, flags);
3848 +       raw_spin_lock_irqsave(&head->lock, flags);
3850         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
3851                 rc = 1;
3852 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
3853                         uv_rtc_find_next_timer(head, pnode);
3854         }
3856 -       spin_unlock_irqrestore(&head->lock, flags);
3857 +       raw_spin_unlock_irqrestore(&head->lock, flags);
3859         return rc;
3861 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
3862  static cycle_t uv_read_rtc(struct clocksource *cs)
3864         unsigned long offset;
3865 +       cycle_t cycles;
3867 +       preempt_disable();
3868         if (uv_get_min_hub_revision_id() == 1)
3869                 offset = 0;
3870         else
3871                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
3873 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3874 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3875 +       preempt_enable();
3877 +       return cycles;
3880  /*
3881 diff --git a/block/blk-core.c b/block/blk-core.c
3882 index ef083e7a37c5..7a7926b1b4b5 100644
3883 --- a/block/blk-core.c
3884 +++ b/block/blk-core.c
3885 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3887         INIT_LIST_HEAD(&rq->queuelist);
3888         INIT_LIST_HEAD(&rq->timeout_list);
3889 +#ifdef CONFIG_PREEMPT_RT_FULL
3890 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3891 +#endif
3892         rq->cpu = -1;
3893         rq->q = q;
3894         rq->__sector = (sector_t) -1;
3895 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3896   **/
3897  void blk_start_queue(struct request_queue *q)
3899 -       WARN_ON(!irqs_disabled());
3900 +       WARN_ON_NONRT(!irqs_disabled());
3902         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3903         __blk_run_queue(q);
3904 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
3905                 if (!gfpflags_allow_blocking(gfp))
3906                         return -EBUSY;
3908 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3909 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3910                                 !atomic_read(&q->mq_freeze_depth) ||
3911                                 blk_queue_dying(q));
3912                 if (blk_queue_dying(q))
3913 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3914         struct request_queue *q =
3915                 container_of(ref, struct request_queue, q_usage_counter);
3917 -       wake_up_all(&q->mq_freeze_wq);
3918 +       swake_up_all(&q->mq_freeze_wq);
3921  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3922 @@ -741,7 +744,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3923         q->bypass_depth = 1;
3924         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3926 -       init_waitqueue_head(&q->mq_freeze_wq);
3927 +       init_swait_queue_head(&q->mq_freeze_wq);
3929         /*
3930          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3931 @@ -3222,7 +3225,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3932                 blk_run_queue_async(q);
3933         else
3934                 __blk_run_queue(q);
3935 -       spin_unlock(q->queue_lock);
3936 +       spin_unlock_irq(q->queue_lock);
3939  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3940 @@ -3270,7 +3273,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3941  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3943         struct request_queue *q;
3944 -       unsigned long flags;
3945         struct request *rq;
3946         LIST_HEAD(list);
3947         unsigned int depth;
3948 @@ -3290,11 +3292,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3949         q = NULL;
3950         depth = 0;
3952 -       /*
3953 -        * Save and disable interrupts here, to avoid doing it for every
3954 -        * queue lock we have to take.
3955 -        */
3956 -       local_irq_save(flags);
3957         while (!list_empty(&list)) {
3958                 rq = list_entry_rq(list.next);
3959                 list_del_init(&rq->queuelist);
3960 @@ -3307,7 +3304,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3961                                 queue_unplugged(q, depth, from_schedule);
3962                         q = rq->q;
3963                         depth = 0;
3964 -                       spin_lock(q->queue_lock);
3965 +                       spin_lock_irq(q->queue_lock);
3966                 }
3968                 /*
3969 @@ -3334,8 +3331,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3970          */
3971         if (q)
3972                 queue_unplugged(q, depth, from_schedule);
3974 -       local_irq_restore(flags);
3977  void blk_finish_plug(struct blk_plug *plug)
3978 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3979 index 381cb50a673c..dc8785233d94 100644
3980 --- a/block/blk-ioc.c
3981 +++ b/block/blk-ioc.c
3982 @@ -7,6 +7,7 @@
3983  #include <linux/bio.h>
3984  #include <linux/blkdev.h>
3985  #include <linux/slab.h>
3986 +#include <linux/delay.h>
3988  #include "blk.h"
3990 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3991                         spin_unlock(q->queue_lock);
3992                 } else {
3993                         spin_unlock_irqrestore(&ioc->lock, flags);
3994 -                       cpu_relax();
3995 +                       cpu_chill();
3996                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3997                 }
3998         }
3999 @@ -187,7 +188,7 @@ retry:
4000                         spin_unlock(icq->q->queue_lock);
4001                 } else {
4002                         spin_unlock_irqrestore(&ioc->lock, flags);
4003 -                       cpu_relax();
4004 +                       cpu_chill();
4005                         goto retry;
4006                 }
4007         }
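
The retry loops above used to spin with cpu_relax() until the queue lock holder made progress; on RT the spinning task can be a high-priority task that starves the preempted lock holder, so the patch substitutes cpu_chill(), an RT-patch helper declared in linux/delay.h that sleeps briefly instead of burning the CPU. A sketch of the pattern, with a made-up predicate:

    #include <linux/delay.h>        /* cpu_chill() is added here by the RT series */
    #include <linux/spinlock.h>

    static void example_retry(spinlock_t *lock, bool (*can_proceed)(void))
    {
            for (;;) {
                    spin_lock(lock);
                    if (can_proceed()) {
                            /* ... do the work ... */
                            spin_unlock(lock);
                            return;
                    }
                    spin_unlock(lock);
                    cpu_chill();    /* RT: short sleep; without RT it is roughly cpu_relax() */
            }
    }
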
4008 diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
4009 index 0736729d6494..3e21e31d0d7e 100644
4010 --- a/block/blk-iopoll.c
4011 +++ b/block/blk-iopoll.c
4012 @@ -35,6 +35,7 @@ void blk_iopoll_sched(struct blk_iopoll *iop)
4013         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
4014         __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4015         local_irq_restore(flags);
4016 +       preempt_check_resched_rt();
4018  EXPORT_SYMBOL(blk_iopoll_sched);
4020 @@ -132,6 +133,7 @@ static void blk_iopoll_softirq(struct softirq_action *h)
4021                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4023         local_irq_enable();
4024 +       preempt_check_resched_rt();
4027  /**
4028 @@ -201,6 +203,7 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self,
4029                                  this_cpu_ptr(&blk_cpu_iopoll));
4030                 __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
4031                 local_irq_enable();
4032 +               preempt_check_resched_rt();
4033         }
4035         return NOTIFY_OK;
4036 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
4037 index bb3ed488f7b5..628c6c13c482 100644
4038 --- a/block/blk-mq-cpu.c
4039 +++ b/block/blk-mq-cpu.c
4040 @@ -16,7 +16,7 @@
4041  #include "blk-mq.h"
4043  static LIST_HEAD(blk_mq_cpu_notify_list);
4044 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
4045 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
4047  static int blk_mq_main_cpu_notify(struct notifier_block *self,
4048                                   unsigned long action, void *hcpu)
4049 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4050         struct blk_mq_cpu_notifier *notify;
4051         int ret = NOTIFY_OK;
4053 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4054 +       if (action != CPU_POST_DEAD)
4055 +               return NOTIFY_OK;
4057 +       spin_lock(&blk_mq_cpu_notify_lock);
4059         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
4060                 ret = notify->notify(notify->data, action, cpu);
4061 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
4062                         break;
4063         }
4065 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4066 +       spin_unlock(&blk_mq_cpu_notify_lock);
4067         return ret;
4070 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4072         BUG_ON(!notifier->notify);
4074 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4075 +       spin_lock(&blk_mq_cpu_notify_lock);
4076         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
4077 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4078 +       spin_unlock(&blk_mq_cpu_notify_lock);
4081  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
4083 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
4084 +       spin_lock(&blk_mq_cpu_notify_lock);
4085         list_del(&notifier->list);
4086 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
4087 +       spin_unlock(&blk_mq_cpu_notify_lock);
4090  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
4091 diff --git a/block/blk-mq.c b/block/blk-mq.c
4092 index 0d1af3e44efb..e4fc80184dd8 100644
4093 --- a/block/blk-mq.c
4094 +++ b/block/blk-mq.c
4095 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
4097  static void blk_mq_freeze_queue_wait(struct request_queue *q)
4099 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4100 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
4103  /*
4104 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
4105         WARN_ON_ONCE(freeze_depth < 0);
4106         if (!freeze_depth) {
4107                 percpu_ref_reinit(&q->q_usage_counter);
4108 -               wake_up_all(&q->mq_freeze_wq);
4109 +               swake_up_all(&q->mq_freeze_wq);
4110         }
4112  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
4113 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
4114          * dying, we need to ensure that processes currently waiting on
4115          * the queue are notified as well.
4116          */
4117 -       wake_up_all(&q->mq_freeze_wq);
4118 +       swake_up_all(&q->mq_freeze_wq);
4121  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
4122 @@ -196,6 +196,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
4123         rq->resid_len = 0;
4124         rq->sense = NULL;
4126 +#ifdef CONFIG_PREEMPT_RT_FULL
4127 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
4128 +#endif
4129         INIT_LIST_HEAD(&rq->timeout_list);
4130         rq->timeout = 0;
4132 @@ -325,6 +328,17 @@ void blk_mq_end_request(struct request *rq, int error)
4134  EXPORT_SYMBOL(blk_mq_end_request);
4136 +#ifdef CONFIG_PREEMPT_RT_FULL
4138 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
4140 +       struct request *rq = container_of(work, struct request, work);
4142 +       rq->q->softirq_done_fn(rq);
4145 +#else
4147  static void __blk_mq_complete_request_remote(void *data)
4149         struct request *rq = data;
4150 @@ -332,6 +346,8 @@ static void __blk_mq_complete_request_remote(void *data)
4151         rq->q->softirq_done_fn(rq);
4154 +#endif
4156  static void blk_mq_ipi_complete_request(struct request *rq)
4158         struct blk_mq_ctx *ctx = rq->mq_ctx;
4159 @@ -343,19 +359,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
4160                 return;
4161         }
4163 -       cpu = get_cpu();
4164 +       cpu = get_cpu_light();
4165         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
4166                 shared = cpus_share_cache(cpu, ctx->cpu);
4168         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
4169 +#ifdef CONFIG_PREEMPT_RT_FULL
4170 +               schedule_work_on(ctx->cpu, &rq->work);
4171 +#else
4172                 rq->csd.func = __blk_mq_complete_request_remote;
4173                 rq->csd.info = rq;
4174                 rq->csd.flags = 0;
4175                 smp_call_function_single_async(ctx->cpu, &rq->csd);
4176 +#endif
4177         } else {
4178                 rq->q->softirq_done_fn(rq);
4179         }
4180 -       put_cpu();
4181 +       put_cpu_light();
4184  static void __blk_mq_complete_request(struct request *rq)
4185 @@ -862,14 +882,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
4186                 return;
4188         if (!async) {
4189 -               int cpu = get_cpu();
4190 +               int cpu = get_cpu_light();
4191                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
4192                         __blk_mq_run_hw_queue(hctx);
4193 -                       put_cpu();
4194 +                       put_cpu_light();
4195                         return;
4196                 }
4198 -               put_cpu();
4199 +               put_cpu_light();
4200         }
4202         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
4203 @@ -1616,7 +1636,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
4205         struct blk_mq_hw_ctx *hctx = data;
4207 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
4208 +       if (action == CPU_POST_DEAD)
4209                 return blk_mq_hctx_cpu_offline(hctx, cpu);
4211         /*
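
Completing a request on the submitting CPU normally goes through smp_call_function_single_async(), an IPI whose handler runs the completion in hard interrupt context; that is not RT-safe, so the blk-core/blk-mq hunks above add a per-request work item and punt the completion to it with schedule_work_on(). The generic shape of that pattern, with invented names:

    #include <linux/workqueue.h>

    static void example_complete_fn(struct work_struct *work)
    {
            /* runs in process context on the CPU it was queued on */
    }

    static DECLARE_WORK(example_complete_work, example_complete_fn);

    static void example_complete_on(int cpu)
    {
            schedule_work_on(cpu, &example_complete_work);
    }
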
4212 diff --git a/block/blk-mq.h b/block/blk-mq.h
4213 index 713820b47b31..3cb6feb4fe23 100644
4214 --- a/block/blk-mq.h
4215 +++ b/block/blk-mq.h
4216 @@ -74,7 +74,10 @@ struct blk_align_bitmap {
4217  static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4218                                            unsigned int cpu)
4220 -       return per_cpu_ptr(q->queue_ctx, cpu);
4221 +       struct blk_mq_ctx *ctx;
4223 +       ctx = per_cpu_ptr(q->queue_ctx, cpu);
4224 +       return ctx;
4227  /*
4228 @@ -85,12 +88,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
4229   */
4230  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
4232 -       return __blk_mq_get_ctx(q, get_cpu());
4233 +       return __blk_mq_get_ctx(q, get_cpu_light());
4236  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
4238 -       put_cpu();
4239 +       put_cpu_light();
4242  struct blk_mq_alloc_data {
4243 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
4244 index 53b1737e978d..81c3c0a62edf 100644
4245 --- a/block/blk-softirq.c
4246 +++ b/block/blk-softirq.c
4247 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
4248                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4250         local_irq_restore(flags);
4251 +       preempt_check_resched_rt();
4254  /*
4255 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
4256                                  this_cpu_ptr(&blk_cpu_done));
4257                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
4258                 local_irq_enable();
4259 +               preempt_check_resched_rt();
4260         }
4262         return NOTIFY_OK;
4263 @@ -150,6 +152,7 @@ do_local:
4264                 goto do_local;
4266         local_irq_restore(flags);
4267 +       preempt_check_resched_rt();
4270  /**
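Each added preempt_check_resched_rt() call sits right after interrupts are re-enabled following raise_softirq_irqoff(). On RT the block softirq is handled by a thread, so raising it may have set NEED_RESCHED; the helper lets the scheduler switch to that thread as soon as it is legal to do so, while on non-RT it compiles away. Rough shape of the helper as added elsewhere in this series (include/linux/preempt.h):

#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_check_resched_rt()	preempt_check_resched()
#else
# define preempt_check_resched_rt()	barrier()
#endif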
4271 diff --git a/block/bounce.c b/block/bounce.c
4272 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
4273 --- a/block/bounce.c
4274 +++ b/block/bounce.c
4275 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
4276         unsigned long flags;
4277         unsigned char *vto;
4279 -       local_irq_save(flags);
4280 +       local_irq_save_nort(flags);
4281         vto = kmap_atomic(to->bv_page);
4282         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
4283         kunmap_atomic(vto);
4284 -       local_irq_restore(flags);
4285 +       local_irq_restore_nort(flags);
4288  #else /* CONFIG_HIGHMEM */
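bounce_copy_vec() only needs interrupts off on mainline, where kmap_atomic() requires an atomic context; on RT the atomic kmap is made preemptible elsewhere in this series, and keeping interrupts disabled across the memcpy() would only add latency. The _nort ("not on RT") variants express exactly that split; rough shape as defined elsewhere in the patch:

#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	local_save_flags(flags)
# define local_irq_restore_nort(flags)	((void)(flags))
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif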
4289 diff --git a/crypto/algapi.c b/crypto/algapi.c
4290 index 43f5bdb6b570..b54fd9187d12 100644
4291 --- a/crypto/algapi.c
4292 +++ b/crypto/algapi.c
4293 @@ -720,13 +720,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
4295  int crypto_register_notifier(struct notifier_block *nb)
4297 -       return blocking_notifier_chain_register(&crypto_chain, nb);
4298 +       return srcu_notifier_chain_register(&crypto_chain, nb);
4300  EXPORT_SYMBOL_GPL(crypto_register_notifier);
4302  int crypto_unregister_notifier(struct notifier_block *nb)
4304 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
4305 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
4307  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
4309 diff --git a/crypto/api.c b/crypto/api.c
4310 index bbc147cb5dec..bc1a848f02ec 100644
4311 --- a/crypto/api.c
4312 +++ b/crypto/api.c
4313 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
4314  DECLARE_RWSEM(crypto_alg_sem);
4315  EXPORT_SYMBOL_GPL(crypto_alg_sem);
4317 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
4318 +SRCU_NOTIFIER_HEAD(crypto_chain);
4319  EXPORT_SYMBOL_GPL(crypto_chain);
4321  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
4322 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
4324         int ok;
4326 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4327 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4328         if (ok == NOTIFY_DONE) {
4329                 request_module("cryptomgr");
4330 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
4331 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
4332         }
4334         return ok;
4335 diff --git a/crypto/internal.h b/crypto/internal.h
4336 index 00e42a3ed814..2e85551e235f 100644
4337 --- a/crypto/internal.h
4338 +++ b/crypto/internal.h
4339 @@ -47,7 +47,7 @@ struct crypto_larval {
4341  extern struct list_head crypto_alg_list;
4342  extern struct rw_semaphore crypto_alg_sem;
4343 -extern struct blocking_notifier_head crypto_chain;
4344 +extern struct srcu_notifier_head crypto_chain;
4346  #ifdef CONFIG_PROC_FS
4347  void __init crypto_init_proc(void);
4348 @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
4350  static inline void crypto_notify(unsigned long val, void *v)
4352 -       blocking_notifier_call_chain(&crypto_chain, val, v);
4353 +       srcu_notifier_call_chain(&crypto_chain, val, v);
4356  #endif /* _CRYPTO_INTERNAL_H */
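The crypto notifier chain is converted from a blocking notifier, whose call path takes an rw_semaphore around the chain walk, to an SRCU notifier chain, whose read side takes no lock at all and stays preemptible. The RT series reports deadlock-prone nesting of that rwsem in the larval lookup path, which the lock-free read side sidesteps, while registration and unregistration remain serialised internally. A minimal, self-contained illustration of the API the three files now use (all example_* names are made up for the example):

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD(example_chain);

static int example_event(struct notifier_block *nb, unsigned long val, void *data)
{
	/* react to the event; NOTIFY_OK lets the chain keep walking */
	return NOTIFY_OK;
}

static struct notifier_block example_nb = {
	.notifier_call = example_event,
};

static void example_use(void)
{
	srcu_notifier_chain_register(&example_chain, &example_nb);
	srcu_notifier_call_chain(&example_chain, 0, NULL);
	srcu_notifier_chain_unregister(&example_chain, &example_nb);
}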
4357 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
4358 index faa97604d878..941497f31cf0 100644
4359 --- a/drivers/acpi/acpica/acglobal.h
4360 +++ b/drivers/acpi/acpica/acglobal.h
4361 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
4362   * interrupt level
4363   */
4364  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
4365 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
4366 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
4367  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
4369  /* Mutex for _OSI support */
4370 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
4371 index 3cf77afd142c..dc32e72132f1 100644
4372 --- a/drivers/acpi/acpica/hwregs.c
4373 +++ b/drivers/acpi/acpica/hwregs.c
4374 @@ -269,14 +269,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
4375                           ACPI_BITMASK_ALL_FIXED_STATUS,
4376                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
4378 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4379 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4381         /* Clear the fixed events in PM1 A/B */
4383         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
4384                                         ACPI_BITMASK_ALL_FIXED_STATUS);
4386 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4387 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4389         if (ACPI_FAILURE(status)) {
4390                 goto exit;
4391 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
4392 index b2e50d8007fe..ff007084dc48 100644
4393 --- a/drivers/acpi/acpica/hwxface.c
4394 +++ b/drivers/acpi/acpica/hwxface.c
4395 @@ -374,7 +374,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4396                 return_ACPI_STATUS(AE_BAD_PARAMETER);
4397         }
4399 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
4400 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
4402         /*
4403          * At this point, we know that the parent register is one of the
4404 @@ -435,7 +435,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
4406  unlock_and_exit:
4408 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
4409 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
4410         return_ACPI_STATUS(status);
4413 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
4414 index ce406e39b669..41a75eb3ae9d 100644
4415 --- a/drivers/acpi/acpica/utmutex.c
4416 +++ b/drivers/acpi/acpica/utmutex.c
4417 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
4418                 return_ACPI_STATUS (status);
4419         }
4421 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
4422 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
4423         if (ACPI_FAILURE (status)) {
4424                 return_ACPI_STATUS (status);
4425         }
4426 @@ -156,7 +156,7 @@ void acpi_ut_mutex_terminate(void)
4427         /* Delete the spinlocks */
4429         acpi_os_delete_lock(acpi_gbl_gpe_lock);
4430 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
4431 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
4432         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
4434         /* Delete the reader/writer lock */
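acpi_gbl_hardware_lock protects short PM1 register read-modify-write sequences and is taken in paths that must stay atomic, including with interrupts disabled. On RT an acpi_spinlock maps to a sleeping spinlock_t, so this one lock becomes a raw spinlock, which keeps the mainline busy-waiting, IRQ-disabling behaviour; the acpi_os_create_raw_lock()/acpi_os_delete_raw_lock() helpers and the acpi_raw_spinlock type are added elsewhere in the series. The pattern, shown in isolation:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_hw_lock);

static void example_touch_hw_register(void)
{
	unsigned long flags;

	/* raw_spinlock_t never sleeps, even on PREEMPT_RT */
	raw_spin_lock_irqsave(&example_hw_lock, flags);
	/* short, bounded hardware register access only */
	raw_spin_unlock_irqrestore(&example_hw_lock, flags);
}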
4435 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
4436 index 7dbba387d12a..65beb7abb4e7 100644
4437 --- a/drivers/ata/libata-sff.c
4438 +++ b/drivers/ata/libata-sff.c
4439 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
4440         unsigned long flags;
4441         unsigned int consumed;
4443 -       local_irq_save(flags);
4444 +       local_irq_save_nort(flags);
4445         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
4446 -       local_irq_restore(flags);
4447 +       local_irq_restore_nort(flags);
4449         return consumed;
4451 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4452                 unsigned long flags;
4454                 /* FIXME: use a bounce buffer */
4455 -               local_irq_save(flags);
4456 +               local_irq_save_nort(flags);
4457                 buf = kmap_atomic(page);
4459                 /* do the actual data transfer */
4460 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
4461                                        do_write);
4463                 kunmap_atomic(buf);
4464 -               local_irq_restore(flags);
4465 +               local_irq_restore_nort(flags);
4466         } else {
4467                 buf = page_address(page);
4468                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
4469 @@ -864,7 +864,7 @@ next_sg:
4470                 unsigned long flags;
4472                 /* FIXME: use bounce buffer */
4473 -               local_irq_save(flags);
4474 +               local_irq_save_nort(flags);
4475                 buf = kmap_atomic(page);
4477                 /* do the actual data transfer */
4478 @@ -872,7 +872,7 @@ next_sg:
4479                                                                 count, rw);
4481                 kunmap_atomic(buf);
4482 -               local_irq_restore(flags);
4483 +               local_irq_restore_nort(flags);
4484         } else {
4485                 buf = page_address(page);
4486                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
4487 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
4488 index 62a93b685c54..5ad3c4e652a8 100644
4489 --- a/drivers/block/zram/zram_drv.c
4490 +++ b/drivers/block/zram/zram_drv.c
4491 @@ -520,6 +520,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
4492                 goto out_error;
4493         }
4495 +       zram_meta_init_table_locks(meta, disksize);
4497         return meta;
4499  out_error:
4500 @@ -568,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4501         unsigned long handle;
4502         size_t size;
4504 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4505 +       zram_lock_table(&meta->table[index]);
4506         handle = meta->table[index].handle;
4507         size = zram_get_obj_size(meta, index);
4509         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
4510 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4511 +               zram_unlock_table(&meta->table[index]);
4512                 memset(mem, 0, PAGE_SIZE);
4513                 return 0;
4514         }
4515 @@ -584,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
4516         else
4517                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
4518         zs_unmap_object(meta->mem_pool, handle);
4519 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4520 +       zram_unlock_table(&meta->table[index]);
4522         /* Should NEVER happen. Return bio error if it does. */
4523         if (unlikely(ret)) {
4524 @@ -604,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
4525         struct zram_meta *meta = zram->meta;
4526         page = bvec->bv_page;
4528 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4529 +       zram_lock_table(&meta->table[index]);
4530         if (unlikely(!meta->table[index].handle) ||
4531                         zram_test_flag(meta, index, ZRAM_ZERO)) {
4532 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4533 +               zram_unlock_table(&meta->table[index]);
4534                 handle_zero_page(bvec);
4535                 return 0;
4536         }
4537 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4538 +       zram_unlock_table(&meta->table[index]);
4540         if (is_partial_io(bvec))
4541                 /* Use  a temporary buffer to decompress the page */
4542 @@ -689,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4543                 if (user_mem)
4544                         kunmap_atomic(user_mem);
4545                 /* Free memory associated with this sector now. */
4546 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4547 +               zram_lock_table(&meta->table[index]);
4548                 zram_free_page(zram, index);
4549                 zram_set_flag(meta, index, ZRAM_ZERO);
4550 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4551 +               zram_unlock_table(&meta->table[index]);
4553                 atomic64_inc(&zram->stats.zero_pages);
4554                 ret = 0;
4555 @@ -752,12 +754,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
4556          * Free memory associated with this sector
4557          * before overwriting unused sectors.
4558          */
4559 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4560 +       zram_lock_table(&meta->table[index]);
4561         zram_free_page(zram, index);
4563         meta->table[index].handle = handle;
4564         zram_set_obj_size(meta, index, clen);
4565 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4566 +       zram_unlock_table(&meta->table[index]);
4568         /* Update stats */
4569         atomic64_add(clen, &zram->stats.compr_data_size);
4570 @@ -800,9 +802,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
4571         }
4573         while (n >= PAGE_SIZE) {
4574 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4575 +               zram_lock_table(&meta->table[index]);
4576                 zram_free_page(zram, index);
4577 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4578 +               zram_unlock_table(&meta->table[index]);
4579                 atomic64_inc(&zram->stats.notify_free);
4580                 index++;
4581                 n -= PAGE_SIZE;
4582 @@ -928,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
4583         zram = bdev->bd_disk->private_data;
4584         meta = zram->meta;
4586 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
4587 +       zram_lock_table(&meta->table[index]);
4588         zram_free_page(zram, index);
4589 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
4590 +       zram_unlock_table(&meta->table[index]);
4591         atomic64_inc(&zram->stats.notify_free);
4594 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
4595 index 8e92339686d7..9e3e953d680e 100644
4596 --- a/drivers/block/zram/zram_drv.h
4597 +++ b/drivers/block/zram/zram_drv.h
4598 @@ -72,6 +72,9 @@ enum zram_pageflags {
4599  struct zram_table_entry {
4600         unsigned long handle;
4601         unsigned long value;
4602 +#ifdef CONFIG_PREEMPT_RT_BASE
4603 +       spinlock_t lock;
4604 +#endif
4605  };
4607  struct zram_stats {
4608 @@ -119,4 +122,42 @@ struct zram {
4609          */
4610         bool claim; /* Protected by bdev->bd_mutex */
4611  };
4613 +#ifndef CONFIG_PREEMPT_RT_BASE
4614 +static inline void zram_lock_table(struct zram_table_entry *table)
4616 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
4619 +static inline void zram_unlock_table(struct zram_table_entry *table)
4621 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
4624 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
4625 +#else /* CONFIG_PREEMPT_RT_BASE */
4626 +static inline void zram_lock_table(struct zram_table_entry *table)
4628 +       spin_lock(&table->lock);
4629 +       __set_bit(ZRAM_ACCESS, &table->value);
4632 +static inline void zram_unlock_table(struct zram_table_entry *table)
4634 +       __clear_bit(ZRAM_ACCESS, &table->value);
4635 +       spin_unlock(&table->lock);
4638 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
4640 +        size_t num_pages = disksize >> PAGE_SHIFT;
4641 +        size_t index;
4643 +        for (index = 0; index < num_pages; index++) {
4644 +               spinlock_t *lock = &meta->table[index].lock;
4645 +               spin_lock_init(lock);
4646 +        }
4648 +#endif /* CONFIG_PREEMPT_RT_BASE */
4650  #endif
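bit_spin_lock() is a one-bit spinlock that spins with preemption disabled, which RT cannot tolerate; the per-entry spinlock_t added under CONFIG_PREEMPT_RT_BASE replaces it with a lock that can sleep and be priority-inherited, while the ZRAM_ACCESS bit is still maintained under the lock so the existing flag bookkeeping stays consistent. zram_meta_init_table_locks() initialises one lock per page slot when the meta table is allocated. Usage sketch (a hypothetical helper, written only to show how callers bracket slot accesses now):

static void example_reset_slot(struct zram_meta *meta, u32 index)
{
	zram_lock_table(&meta->table[index]);
	meta->table[index].handle = 0;
	zram_unlock_table(&meta->table[index]);
}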
4651 diff --git a/drivers/char/random.c b/drivers/char/random.c
4652 index 491a4dce13fe..cf69b6b42208 100644
4653 --- a/drivers/char/random.c
4654 +++ b/drivers/char/random.c
4655 @@ -799,8 +799,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4656         } sample;
4657         long delta, delta2, delta3;
4659 -       preempt_disable();
4661         sample.jiffies = jiffies;
4662         sample.cycles = random_get_entropy();
4663         sample.num = num;
4664 @@ -841,7 +839,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
4665                  */
4666                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
4667         }
4668 -       preempt_enable();
4671  void add_input_randomness(unsigned int type, unsigned int code,
4672 @@ -894,28 +891,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
4673         return *(ptr + f->reg_idx++);
4676 -void add_interrupt_randomness(int irq, int irq_flags)
4677 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
4679         struct entropy_store    *r;
4680         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
4681 -       struct pt_regs          *regs = get_irq_regs();
4682         unsigned long           now = jiffies;
4683         cycles_t                cycles = random_get_entropy();
4684         __u32                   c_high, j_high;
4685 -       __u64                   ip;
4686         unsigned long           seed;
4687         int                     credit = 0;
4689         if (cycles == 0)
4690 -               cycles = get_reg(fast_pool, regs);
4691 +               cycles = get_reg(fast_pool, NULL);
4692         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
4693         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
4694         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
4695         fast_pool->pool[1] ^= now ^ c_high;
4696 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
4697 +       if (!ip)
4698 +               ip = _RET_IP_;
4699         fast_pool->pool[2] ^= ip;
4700         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
4701 -               get_reg(fast_pool, regs);
4702 +               get_reg(fast_pool, NULL);
4704         fast_mix(fast_pool);
4705         add_interrupt_bench(cycles);
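add_interrupt_randomness() used to derive the instruction pointer from get_irq_regs() and wrap the fast_pool update in preempt_disable(). On RT it is no longer invoked straight from the hard interrupt (the irq handling code, modified elsewhere in this patch, records what it needs and defers the mixing), so the register snapshot is not available at call time; the ip is therefore passed in by the caller, with _RET_IP_ as the fallback, and the explicit preempt_disable()/preempt_enable() pair is dropped as part of the same rework. A hypothetical caller, only to show the new signature (example_mix_entropy() is not part of the patch):

static void example_mix_entropy(int irq, struct pt_regs *regs)
{
	__u64 ip = regs ? instruction_pointer(regs) : 0;

	/* a zero ip makes add_interrupt_randomness() fall back to _RET_IP_ */
	add_interrupt_randomness(irq, 0, ip);
}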
4706 diff --git a/drivers/clk/at91/clk-generated.c b/drivers/clk/at91/clk-generated.c
4707 index abc80949e1dd..4ad3298eb372 100644
4708 --- a/drivers/clk/at91/clk-generated.c
4709 +++ b/drivers/clk/at91/clk-generated.c
4710 @@ -15,8 +15,8 @@
4711  #include <linux/clkdev.h>
4712  #include <linux/clk/at91_pmc.h>
4713  #include <linux/of.h>
4714 -#include <linux/of_address.h>
4715 -#include <linux/io.h>
4716 +#include <linux/mfd/syscon.h>
4717 +#include <linux/regmap.h>
4719  #include "pmc.h"
4721 @@ -28,8 +28,9 @@
4723  struct clk_generated {
4724         struct clk_hw hw;
4725 -       struct at91_pmc *pmc;
4726 +       struct regmap *regmap;
4727         struct clk_range range;
4728 +       spinlock_t *lock;
4729         u32 id;
4730         u32 gckdiv;
4731         u8 parent_id;
4732 @@ -41,49 +42,52 @@ struct clk_generated {
4733  static int clk_generated_enable(struct clk_hw *hw)
4735         struct clk_generated *gck = to_clk_generated(hw);
4736 -       struct at91_pmc *pmc = gck->pmc;
4737 -       u32 tmp;
4738 +       unsigned long flags;
4740         pr_debug("GCLK: %s, gckdiv = %d, parent id = %d\n",
4741                  __func__, gck->gckdiv, gck->parent_id);
4743 -       pmc_lock(pmc);
4744 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4745 -       tmp = pmc_read(pmc, AT91_PMC_PCR) &
4746 -                       ~(AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK);
4747 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_GCKCSS(gck->parent_id)
4748 -                                        | AT91_PMC_PCR_CMD
4749 -                                        | AT91_PMC_PCR_GCKDIV(gck->gckdiv)
4750 -                                        | AT91_PMC_PCR_GCKEN);
4751 -       pmc_unlock(pmc);
4752 +       spin_lock_irqsave(gck->lock, flags);
4753 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4754 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4755 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4756 +                          AT91_PMC_PCR_GCKDIV_MASK | AT91_PMC_PCR_GCKCSS_MASK |
4757 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4758 +                          AT91_PMC_PCR_GCKCSS(gck->parent_id) |
4759 +                          AT91_PMC_PCR_CMD |
4760 +                          AT91_PMC_PCR_GCKDIV(gck->gckdiv) |
4761 +                          AT91_PMC_PCR_GCKEN);
4762 +       spin_unlock_irqrestore(gck->lock, flags);
4763         return 0;
4766  static void clk_generated_disable(struct clk_hw *hw)
4768         struct clk_generated *gck = to_clk_generated(hw);
4769 -       struct at91_pmc *pmc = gck->pmc;
4770 -       u32 tmp;
4772 -       pmc_lock(pmc);
4773 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4774 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_GCKEN;
4775 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
4776 -       pmc_unlock(pmc);
4777 +       unsigned long flags;
4779 +       spin_lock_irqsave(gck->lock, flags);
4780 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4781 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4782 +       regmap_update_bits(gck->regmap, AT91_PMC_PCR,
4783 +                          AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
4784 +                          AT91_PMC_PCR_CMD);
4785 +       spin_unlock_irqrestore(gck->lock, flags);
4788  static int clk_generated_is_enabled(struct clk_hw *hw)
4790         struct clk_generated *gck = to_clk_generated(hw);
4791 -       struct at91_pmc *pmc = gck->pmc;
4792 -       int ret;
4793 +       unsigned long flags;
4794 +       unsigned int status;
4796 -       pmc_lock(pmc);
4797 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4798 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_GCKEN);
4799 -       pmc_unlock(pmc);
4800 +       spin_lock_irqsave(gck->lock, flags);
4801 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4802 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4803 +       regmap_read(gck->regmap, AT91_PMC_PCR, &status);
4804 +       spin_unlock_irqrestore(gck->lock, flags);
4806 -       return ret;
4807 +       return status & AT91_PMC_PCR_GCKEN ? 1 : 0;
4810  static unsigned long
4811 @@ -214,13 +218,14 @@ static const struct clk_ops generated_ops = {
4812   */
4813  static void clk_generated_startup(struct clk_generated *gck)
4815 -       struct at91_pmc *pmc = gck->pmc;
4816         u32 tmp;
4817 +       unsigned long flags;
4819 -       pmc_lock(pmc);
4820 -       pmc_write(pmc, AT91_PMC_PCR, (gck->id & AT91_PMC_PCR_PID_MASK));
4821 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
4822 -       pmc_unlock(pmc);
4823 +       spin_lock_irqsave(gck->lock, flags);
4824 +       regmap_write(gck->regmap, AT91_PMC_PCR,
4825 +                    (gck->id & AT91_PMC_PCR_PID_MASK));
4826 +       regmap_read(gck->regmap, AT91_PMC_PCR, &tmp);
4827 +       spin_unlock_irqrestore(gck->lock, flags);
4829         gck->parent_id = (tmp & AT91_PMC_PCR_GCKCSS_MASK)
4830                                         >> AT91_PMC_PCR_GCKCSS_OFFSET;
4831 @@ -229,8 +234,8 @@ static void clk_generated_startup(struct clk_generated *gck)
4834  static struct clk * __init
4835 -at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4836 -                           const char **parent_names, u8 num_parents,
4837 +at91_clk_register_generated(struct regmap *regmap,  spinlock_t *lock, const char
4838 +                           *name, const char **parent_names, u8 num_parents,
4839                             u8 id, const struct clk_range *range)
4841         struct clk_generated *gck;
4842 @@ -249,7 +254,8 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4844         gck->id = id;
4845         gck->hw.init = &init;
4846 -       gck->pmc = pmc;
4847 +       gck->regmap = regmap;
4848 +       gck->lock = lock;
4849         gck->range = *range;
4851         clk = clk_register(NULL, &gck->hw);
4852 @@ -261,8 +267,7 @@ at91_clk_register_generated(struct at91_pmc *pmc, const char *name,
4853         return clk;
4856 -void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4857 -                                          struct at91_pmc *pmc)
4858 +void __init of_sama5d2_clk_generated_setup(struct device_node *np)
4860         int num;
4861         u32 id;
4862 @@ -272,6 +277,7 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4863         const char *parent_names[GENERATED_SOURCE_MAX];
4864         struct device_node *gcknp;
4865         struct clk_range range = CLK_RANGE(0, 0);
4866 +       struct regmap *regmap;
4868         num_parents = of_clk_get_parent_count(np);
4869         if (num_parents <= 0 || num_parents > GENERATED_SOURCE_MAX)
4870 @@ -283,6 +289,10 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4871         if (!num || num > PERIPHERAL_MAX)
4872                 return;
4874 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4875 +       if (IS_ERR(regmap))
4876 +               return;
4878         for_each_child_of_node(np, gcknp) {
4879                 if (of_property_read_u32(gcknp, "reg", &id))
4880                         continue;
4881 @@ -296,11 +306,14 @@ void __init of_sama5d2_clk_generated_setup(struct device_node *np,
4882                 of_at91_get_clk_range(gcknp, "atmel,clk-output-range",
4883                                       &range);
4885 -               clk = at91_clk_register_generated(pmc, name, parent_names,
4886 -                                                 num_parents, id, &range);
4887 +               clk = at91_clk_register_generated(regmap, &pmc_pcr_lock, name,
4888 +                                                 parent_names, num_parents,
4889 +                                                 id, &range);
4890                 if (IS_ERR(clk))
4891                         continue;
4893                 of_clk_add_provider(gcknp, of_clk_src_simple_get, clk);
4894         }
4896 +CLK_OF_DECLARE(of_sama5d2_clk_generated_setup, "atmel,sama5d2-clk-generated",
4897 +              of_sama5d2_clk_generated_setup);
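clk-generated, like the other at91 clock drivers touched below, stops going through the driver-private struct at91_pmc accessors (pmc_lock()/pmc_read()/pmc_write()) and instead obtains a syscon regmap from the parent PMC node. A read-modify-write of a single register is done with regmap_update_bits(), which serialises internally; only the two-step PCR sequence (write the peripheral id, then update that peripheral's bits) still needs the external pmc_pcr_lock spinlock provided by pmc.c elsewhere in this series. Each driver also registers itself with CLK_OF_DECLARE() instead of being called by the PMC driver. The core pattern, reduced to an illustrative helper (example_enable_gck() is not part of the patch):

static void example_enable_gck(struct regmap *regmap, spinlock_t *lock, u32 id)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	/* select the peripheral, then set the command/enable bits for it */
	regmap_write(regmap, AT91_PMC_PCR, id & AT91_PMC_PCR_PID_MASK);
	regmap_update_bits(regmap, AT91_PMC_PCR,
			   AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN,
			   AT91_PMC_PCR_CMD | AT91_PMC_PCR_GCKEN);
	spin_unlock_irqrestore(lock, flags);
}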
4898 diff --git a/drivers/clk/at91/clk-h32mx.c b/drivers/clk/at91/clk-h32mx.c
4899 index a165230e7eda..8e20c8a76db7 100644
4900 --- a/drivers/clk/at91/clk-h32mx.c
4901 +++ b/drivers/clk/at91/clk-h32mx.c
4902 @@ -15,15 +15,9 @@
4903  #include <linux/clk-provider.h>
4904  #include <linux/clkdev.h>
4905  #include <linux/clk/at91_pmc.h>
4906 -#include <linux/delay.h>
4907  #include <linux/of.h>
4908 -#include <linux/of_address.h>
4909 -#include <linux/of_irq.h>
4910 -#include <linux/io.h>
4911 -#include <linux/interrupt.h>
4912 -#include <linux/irq.h>
4913 -#include <linux/sched.h>
4914 -#include <linux/wait.h>
4915 +#include <linux/regmap.h>
4916 +#include <linux/mfd/syscon.h>
4918  #include "pmc.h"
4920 @@ -31,7 +25,7 @@
4922  struct clk_sama5d4_h32mx {
4923         struct clk_hw hw;
4924 -       struct at91_pmc *pmc;
4925 +       struct regmap *regmap;
4926  };
4928  #define to_clk_sama5d4_h32mx(hw) container_of(hw, struct clk_sama5d4_h32mx, hw)
4929 @@ -40,8 +34,10 @@ static unsigned long clk_sama5d4_h32mx_recalc_rate(struct clk_hw *hw,
4930                                                  unsigned long parent_rate)
4932         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4933 +       unsigned int mckr;
4935 -       if (pmc_read(h32mxclk->pmc, AT91_PMC_MCKR) & AT91_PMC_H32MXDIV)
4936 +       regmap_read(h32mxclk->regmap, AT91_PMC_MCKR, &mckr);
4937 +       if (mckr & AT91_PMC_H32MXDIV)
4938                 return parent_rate / 2;
4940         if (parent_rate > H32MX_MAX_FREQ)
4941 @@ -70,18 +66,16 @@ static int clk_sama5d4_h32mx_set_rate(struct clk_hw *hw, unsigned long rate,
4942                                     unsigned long parent_rate)
4944         struct clk_sama5d4_h32mx *h32mxclk = to_clk_sama5d4_h32mx(hw);
4945 -       struct at91_pmc *pmc = h32mxclk->pmc;
4946 -       u32 tmp;
4947 +       u32 mckr = 0;
4949         if (parent_rate != rate && (parent_rate / 2) != rate)
4950                 return -EINVAL;
4952 -       pmc_lock(pmc);
4953 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_H32MXDIV;
4954         if ((parent_rate / 2) == rate)
4955 -               tmp |= AT91_PMC_H32MXDIV;
4956 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
4957 -       pmc_unlock(pmc);
4958 +               mckr = AT91_PMC_H32MXDIV;
4960 +       regmap_update_bits(h32mxclk->regmap, AT91_PMC_MCKR,
4961 +                          AT91_PMC_H32MXDIV, mckr);
4963         return 0;
4965 @@ -92,14 +86,18 @@ static const struct clk_ops h32mx_ops = {
4966         .set_rate = clk_sama5d4_h32mx_set_rate,
4967  };
4969 -void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4970 -                                    struct at91_pmc *pmc)
4971 +static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np)
4973         struct clk_sama5d4_h32mx *h32mxclk;
4974         struct clk_init_data init;
4975         const char *parent_name;
4976 +       struct regmap *regmap;
4977         struct clk *clk;
4979 +       regmap = syscon_node_to_regmap(of_get_parent(np));
4980 +       if (IS_ERR(regmap))
4981 +               return;
4983         h32mxclk = kzalloc(sizeof(*h32mxclk), GFP_KERNEL);
4984         if (!h32mxclk)
4985                 return;
4986 @@ -113,7 +111,7 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4987         init.flags = CLK_SET_RATE_GATE;
4989         h32mxclk->hw.init = &init;
4990 -       h32mxclk->pmc = pmc;
4991 +       h32mxclk->regmap = regmap;
4993         clk = clk_register(NULL, &h32mxclk->hw);
4994         if (IS_ERR(clk)) {
4995 @@ -123,3 +121,5 @@ void __init of_sama5d4_clk_h32mx_setup(struct device_node *np,
4997         of_clk_add_provider(np, of_clk_src_simple_get, clk);
4999 +CLK_OF_DECLARE(of_sama5d4_clk_h32mx_setup, "atmel,sama5d4-clk-h32mx",
5000 +              of_sama5d4_clk_h32mx_setup);
5001 diff --git a/drivers/clk/at91/clk-main.c b/drivers/clk/at91/clk-main.c
5002 index fd7247deabdc..4bfc94d6c26e 100644
5003 --- a/drivers/clk/at91/clk-main.c
5004 +++ b/drivers/clk/at91/clk-main.c
5005 @@ -13,13 +13,8 @@
5006  #include <linux/clk/at91_pmc.h>
5007  #include <linux/delay.h>
5008  #include <linux/of.h>
5009 -#include <linux/of_address.h>
5010 -#include <linux/of_irq.h>
5011 -#include <linux/io.h>
5012 -#include <linux/interrupt.h>
5013 -#include <linux/irq.h>
5014 -#include <linux/sched.h>
5015 -#include <linux/wait.h>
5016 +#include <linux/mfd/syscon.h>
5017 +#include <linux/regmap.h>
5019  #include "pmc.h"
5021 @@ -34,18 +29,14 @@
5023  struct clk_main_osc {
5024         struct clk_hw hw;
5025 -       struct at91_pmc *pmc;
5026 -       unsigned int irq;
5027 -       wait_queue_head_t wait;
5028 +       struct regmap *regmap;
5029  };
5031  #define to_clk_main_osc(hw) container_of(hw, struct clk_main_osc, hw)
5033  struct clk_main_rc_osc {
5034         struct clk_hw hw;
5035 -       struct at91_pmc *pmc;
5036 -       unsigned int irq;
5037 -       wait_queue_head_t wait;
5038 +       struct regmap *regmap;
5039         unsigned long frequency;
5040         unsigned long accuracy;
5041  };
5042 @@ -54,51 +45,47 @@ struct clk_main_rc_osc {
5044  struct clk_rm9200_main {
5045         struct clk_hw hw;
5046 -       struct at91_pmc *pmc;
5047 +       struct regmap *regmap;
5048  };
5050  #define to_clk_rm9200_main(hw) container_of(hw, struct clk_rm9200_main, hw)
5052  struct clk_sam9x5_main {
5053         struct clk_hw hw;
5054 -       struct at91_pmc *pmc;
5055 -       unsigned int irq;
5056 -       wait_queue_head_t wait;
5057 +       struct regmap *regmap;
5058         u8 parent;
5059  };
5061  #define to_clk_sam9x5_main(hw) container_of(hw, struct clk_sam9x5_main, hw)
5063 -static irqreturn_t clk_main_osc_irq_handler(int irq, void *dev_id)
5064 +static inline bool clk_main_osc_ready(struct regmap *regmap)
5066 -       struct clk_main_osc *osc = dev_id;
5067 +       unsigned int status;
5069 -       wake_up(&osc->wait);
5070 -       disable_irq_nosync(osc->irq);
5071 +       regmap_read(regmap, AT91_PMC_SR, &status);
5073 -       return IRQ_HANDLED;
5074 +       return status & AT91_PMC_MOSCS;
5077  static int clk_main_osc_prepare(struct clk_hw *hw)
5079         struct clk_main_osc *osc = to_clk_main_osc(hw);
5080 -       struct at91_pmc *pmc = osc->pmc;
5081 +       struct regmap *regmap = osc->regmap;
5082         u32 tmp;
5084 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5085 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5086 +       tmp &= ~MOR_KEY_MASK;
5088         if (tmp & AT91_PMC_OSCBYPASS)
5089                 return 0;
5091         if (!(tmp & AT91_PMC_MOSCEN)) {
5092                 tmp |= AT91_PMC_MOSCEN | AT91_PMC_KEY;
5093 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5094 +               regmap_write(regmap, AT91_CKGR_MOR, tmp);
5095         }
5097 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS)) {
5098 -               enable_irq(osc->irq);
5099 -               wait_event(osc->wait,
5100 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS);
5101 -       }
5102 +       while (!clk_main_osc_ready(regmap))
5103 +               cpu_relax();
5105         return 0;
5107 @@ -106,9 +93,10 @@ static int clk_main_osc_prepare(struct clk_hw *hw)
5108  static void clk_main_osc_unprepare(struct clk_hw *hw)
5110         struct clk_main_osc *osc = to_clk_main_osc(hw);
5111 -       struct at91_pmc *pmc = osc->pmc;
5112 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5113 +       struct regmap *regmap = osc->regmap;
5114 +       u32 tmp;
5116 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5117         if (tmp & AT91_PMC_OSCBYPASS)
5118                 return;
5120 @@ -116,20 +104,22 @@ static void clk_main_osc_unprepare(struct clk_hw *hw)
5121                 return;
5123         tmp &= ~(AT91_PMC_KEY | AT91_PMC_MOSCEN);
5124 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5125 +       regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5128  static int clk_main_osc_is_prepared(struct clk_hw *hw)
5130         struct clk_main_osc *osc = to_clk_main_osc(hw);
5131 -       struct at91_pmc *pmc = osc->pmc;
5132 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5133 +       struct regmap *regmap = osc->regmap;
5134 +       u32 tmp, status;
5136 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5137         if (tmp & AT91_PMC_OSCBYPASS)
5138                 return 1;
5140 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCS) &&
5141 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN));
5142 +       regmap_read(regmap, AT91_PMC_SR, &status);
5144 +       return (status & AT91_PMC_MOSCS) && (tmp & AT91_PMC_MOSCEN);
5147  static const struct clk_ops main_osc_ops = {
5148 @@ -139,18 +129,16 @@ static const struct clk_ops main_osc_ops = {
5149  };
5151  static struct clk * __init
5152 -at91_clk_register_main_osc(struct at91_pmc *pmc,
5153 -                          unsigned int irq,
5154 +at91_clk_register_main_osc(struct regmap *regmap,
5155                            const char *name,
5156                            const char *parent_name,
5157                            bool bypass)
5159 -       int ret;
5160         struct clk_main_osc *osc;
5161         struct clk *clk = NULL;
5162         struct clk_init_data init;
5164 -       if (!pmc || !irq || !name || !parent_name)
5165 +       if (!name || !parent_name)
5166                 return ERR_PTR(-EINVAL);
5168         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5169 @@ -164,85 +152,70 @@ at91_clk_register_main_osc(struct at91_pmc *pmc,
5170         init.flags = CLK_IGNORE_UNUSED;
5172         osc->hw.init = &init;
5173 -       osc->pmc = pmc;
5174 -       osc->irq = irq;
5176 -       init_waitqueue_head(&osc->wait);
5177 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5178 -       ret = request_irq(osc->irq, clk_main_osc_irq_handler,
5179 -                         IRQF_TRIGGER_HIGH, name, osc);
5180 -       if (ret) {
5181 -               kfree(osc);
5182 -               return ERR_PTR(ret);
5183 -       }
5184 +       osc->regmap = regmap;
5186         if (bypass)
5187 -               pmc_write(pmc, AT91_CKGR_MOR,
5188 -                         (pmc_read(pmc, AT91_CKGR_MOR) &
5189 -                          ~(MOR_KEY_MASK | AT91_PMC_MOSCEN)) |
5190 -                         AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5191 +               regmap_update_bits(regmap,
5192 +                                  AT91_CKGR_MOR, MOR_KEY_MASK |
5193 +                                  AT91_PMC_MOSCEN,
5194 +                                  AT91_PMC_OSCBYPASS | AT91_PMC_KEY);
5196         clk = clk_register(NULL, &osc->hw);
5197 -       if (IS_ERR(clk)) {
5198 -               free_irq(irq, osc);
5199 +       if (IS_ERR(clk))
5200                 kfree(osc);
5201 -       }
5203         return clk;
5206 -void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np,
5207 -                                            struct at91_pmc *pmc)
5208 +static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np)
5210         struct clk *clk;
5211 -       unsigned int irq;
5212         const char *name = np->name;
5213         const char *parent_name;
5214 +       struct regmap *regmap;
5215         bool bypass;
5217         of_property_read_string(np, "clock-output-names", &name);
5218         bypass = of_property_read_bool(np, "atmel,osc-bypass");
5219         parent_name = of_clk_get_parent_name(np, 0);
5221 -       irq = irq_of_parse_and_map(np, 0);
5222 -       if (!irq)
5223 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5224 +       if (IS_ERR(regmap))
5225                 return;
5227 -       clk = at91_clk_register_main_osc(pmc, irq, name, parent_name, bypass);
5228 +       clk = at91_clk_register_main_osc(regmap, name, parent_name, bypass);
5229         if (IS_ERR(clk))
5230                 return;
5232         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5234 +CLK_OF_DECLARE(at91rm9200_clk_main_osc, "atmel,at91rm9200-clk-main-osc",
5235 +              of_at91rm9200_clk_main_osc_setup);
5237 -static irqreturn_t clk_main_rc_osc_irq_handler(int irq, void *dev_id)
5238 +static bool clk_main_rc_osc_ready(struct regmap *regmap)
5240 -       struct clk_main_rc_osc *osc = dev_id;
5241 +       unsigned int status;
5243 -       wake_up(&osc->wait);
5244 -       disable_irq_nosync(osc->irq);
5245 +       regmap_read(regmap, AT91_PMC_SR, &status);
5247 -       return IRQ_HANDLED;
5248 +       return status & AT91_PMC_MOSCRCS;
5251  static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5253         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5254 -       struct at91_pmc *pmc = osc->pmc;
5255 -       u32 tmp;
5256 +       struct regmap *regmap = osc->regmap;
5257 +       unsigned int mor;
5259 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5260 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5262 -       if (!(tmp & AT91_PMC_MOSCRCEN)) {
5263 -               tmp |= AT91_PMC_MOSCRCEN | AT91_PMC_KEY;
5264 -               pmc_write(pmc, AT91_CKGR_MOR, tmp);
5265 -       }
5266 +       if (!(mor & AT91_PMC_MOSCRCEN))
5267 +               regmap_update_bits(regmap, AT91_CKGR_MOR,
5268 +                                  MOR_KEY_MASK | AT91_PMC_MOSCRCEN,
5269 +                                  AT91_PMC_MOSCRCEN | AT91_PMC_KEY);
5271 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS)) {
5272 -               enable_irq(osc->irq);
5273 -               wait_event(osc->wait,
5274 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS);
5275 -       }
5276 +       while (!clk_main_rc_osc_ready(regmap))
5277 +               cpu_relax();
5279         return 0;
5281 @@ -250,23 +223,28 @@ static int clk_main_rc_osc_prepare(struct clk_hw *hw)
5282  static void clk_main_rc_osc_unprepare(struct clk_hw *hw)
5284         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5285 -       struct at91_pmc *pmc = osc->pmc;
5286 -       u32 tmp = pmc_read(pmc, AT91_CKGR_MOR);
5287 +       struct regmap *regmap = osc->regmap;
5288 +       unsigned int mor;
5290 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5292 -       if (!(tmp & AT91_PMC_MOSCRCEN))
5293 +       if (!(mor & AT91_PMC_MOSCRCEN))
5294                 return;
5296 -       tmp &= ~(MOR_KEY_MASK | AT91_PMC_MOSCRCEN);
5297 -       pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_KEY);
5298 +       regmap_update_bits(regmap, AT91_CKGR_MOR,
5299 +                          MOR_KEY_MASK | AT91_PMC_MOSCRCEN, AT91_PMC_KEY);
5302  static int clk_main_rc_osc_is_prepared(struct clk_hw *hw)
5304         struct clk_main_rc_osc *osc = to_clk_main_rc_osc(hw);
5305 -       struct at91_pmc *pmc = osc->pmc;
5306 +       struct regmap *regmap = osc->regmap;
5307 +       unsigned int mor, status;
5309 -       return !!((pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCRCS) &&
5310 -                 (pmc_read(pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCRCEN));
5311 +       regmap_read(regmap, AT91_CKGR_MOR, &mor);
5312 +       regmap_read(regmap, AT91_PMC_SR, &status);
5314 +       return (mor & AT91_PMC_MOSCRCEN) && (status & AT91_PMC_MOSCRCS);
5317  static unsigned long clk_main_rc_osc_recalc_rate(struct clk_hw *hw,
5318 @@ -294,17 +272,15 @@ static const struct clk_ops main_rc_osc_ops = {
5319  };
5321  static struct clk * __init
5322 -at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5323 -                             unsigned int irq,
5324 +at91_clk_register_main_rc_osc(struct regmap *regmap,
5325                               const char *name,
5326                               u32 frequency, u32 accuracy)
5328 -       int ret;
5329         struct clk_main_rc_osc *osc;
5330         struct clk *clk = NULL;
5331         struct clk_init_data init;
5333 -       if (!pmc || !irq || !name || !frequency)
5334 +       if (!name || !frequency)
5335                 return ERR_PTR(-EINVAL);
5337         osc = kzalloc(sizeof(*osc), GFP_KERNEL);
5338 @@ -318,63 +294,53 @@ at91_clk_register_main_rc_osc(struct at91_pmc *pmc,
5339         init.flags = CLK_IS_ROOT | CLK_IGNORE_UNUSED;
5341         osc->hw.init = &init;
5342 -       osc->pmc = pmc;
5343 -       osc->irq = irq;
5344 +       osc->regmap = regmap;
5345         osc->frequency = frequency;
5346         osc->accuracy = accuracy;
5348 -       init_waitqueue_head(&osc->wait);
5349 -       irq_set_status_flags(osc->irq, IRQ_NOAUTOEN);
5350 -       ret = request_irq(osc->irq, clk_main_rc_osc_irq_handler,
5351 -                         IRQF_TRIGGER_HIGH, name, osc);
5352 -       if (ret)
5353 -               return ERR_PTR(ret);
5355         clk = clk_register(NULL, &osc->hw);
5356 -       if (IS_ERR(clk)) {
5357 -               free_irq(irq, osc);
5358 +       if (IS_ERR(clk))
5359                 kfree(osc);
5360 -       }
5362         return clk;
5365 -void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
5366 -                                               struct at91_pmc *pmc)
5367 +static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np)
5369         struct clk *clk;
5370 -       unsigned int irq;
5371         u32 frequency = 0;
5372         u32 accuracy = 0;
5373         const char *name = np->name;
5374 +       struct regmap *regmap;
5376         of_property_read_string(np, "clock-output-names", &name);
5377         of_property_read_u32(np, "clock-frequency", &frequency);
5378         of_property_read_u32(np, "clock-accuracy", &accuracy);
5380 -       irq = irq_of_parse_and_map(np, 0);
5381 -       if (!irq)
5382 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5383 +       if (IS_ERR(regmap))
5384                 return;
5386 -       clk = at91_clk_register_main_rc_osc(pmc, irq, name, frequency,
5387 -                                           accuracy);
5388 +       clk = at91_clk_register_main_rc_osc(regmap, name, frequency, accuracy);
5389         if (IS_ERR(clk))
5390                 return;
5392         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5394 +CLK_OF_DECLARE(at91sam9x5_clk_main_rc_osc, "atmel,at91sam9x5-clk-main-rc-osc",
5395 +              of_at91sam9x5_clk_main_rc_osc_setup);
5398 -static int clk_main_probe_frequency(struct at91_pmc *pmc)
5399 +static int clk_main_probe_frequency(struct regmap *regmap)
5401         unsigned long prep_time, timeout;
5402 -       u32 tmp;
5403 +       unsigned int mcfr;
5405         timeout = jiffies + usecs_to_jiffies(MAINFRDY_TIMEOUT);
5406         do {
5407                 prep_time = jiffies;
5408 -               tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5409 -               if (tmp & AT91_PMC_MAINRDY)
5410 +               regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5411 +               if (mcfr & AT91_PMC_MAINRDY)
5412                         return 0;
5413                 usleep_range(MAINF_LOOP_MIN_WAIT, MAINF_LOOP_MAX_WAIT);
5414         } while (time_before(prep_time, timeout));
5415 @@ -382,34 +348,37 @@ static int clk_main_probe_frequency(struct at91_pmc *pmc)
5416         return -ETIMEDOUT;
5419 -static unsigned long clk_main_recalc_rate(struct at91_pmc *pmc,
5420 +static unsigned long clk_main_recalc_rate(struct regmap *regmap,
5421                                           unsigned long parent_rate)
5423 -       u32 tmp;
5424 +       unsigned int mcfr;
5426         if (parent_rate)
5427                 return parent_rate;
5429         pr_warn("Main crystal frequency not set, using approximate value\n");
5430 -       tmp = pmc_read(pmc, AT91_CKGR_MCFR);
5431 -       if (!(tmp & AT91_PMC_MAINRDY))
5432 +       regmap_read(regmap, AT91_CKGR_MCFR, &mcfr);
5433 +       if (!(mcfr & AT91_PMC_MAINRDY))
5434                 return 0;
5436 -       return ((tmp & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5437 +       return ((mcfr & AT91_PMC_MAINF) * SLOW_CLOCK_FREQ) / MAINF_DIV;
5440  static int clk_rm9200_main_prepare(struct clk_hw *hw)
5442         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5444 -       return clk_main_probe_frequency(clkmain->pmc);
5445 +       return clk_main_probe_frequency(clkmain->regmap);
5448  static int clk_rm9200_main_is_prepared(struct clk_hw *hw)
5450         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5451 +       unsigned int status;
5453 +       regmap_read(clkmain->regmap, AT91_CKGR_MCFR, &status);
5455 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MCFR) & AT91_PMC_MAINRDY);
5456 +       return status & AT91_PMC_MAINRDY ? 1 : 0;
5459  static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5460 @@ -417,7 +386,7 @@ static unsigned long clk_rm9200_main_recalc_rate(struct clk_hw *hw,
5462         struct clk_rm9200_main *clkmain = to_clk_rm9200_main(hw);
5464 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5465 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5468  static const struct clk_ops rm9200_main_ops = {
5469 @@ -427,7 +396,7 @@ static const struct clk_ops rm9200_main_ops = {
5470  };
5472  static struct clk * __init
5473 -at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5474 +at91_clk_register_rm9200_main(struct regmap *regmap,
5475                               const char *name,
5476                               const char *parent_name)
5478 @@ -435,7 +404,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5479         struct clk *clk = NULL;
5480         struct clk_init_data init;
5482 -       if (!pmc || !name)
5483 +       if (!name)
5484                 return ERR_PTR(-EINVAL);
5486         if (!parent_name)
5487 @@ -452,7 +421,7 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5488         init.flags = 0;
5490         clkmain->hw.init = &init;
5491 -       clkmain->pmc = pmc;
5492 +       clkmain->regmap = regmap;
5494         clk = clk_register(NULL, &clkmain->hw);
5495         if (IS_ERR(clk))
5496 @@ -461,52 +430,54 @@ at91_clk_register_rm9200_main(struct at91_pmc *pmc,
5497         return clk;
5500 -void __init of_at91rm9200_clk_main_setup(struct device_node *np,
5501 -                                        struct at91_pmc *pmc)
5502 +static void __init of_at91rm9200_clk_main_setup(struct device_node *np)
5504         struct clk *clk;
5505         const char *parent_name;
5506         const char *name = np->name;
5507 +       struct regmap *regmap;
5509         parent_name = of_clk_get_parent_name(np, 0);
5510         of_property_read_string(np, "clock-output-names", &name);
5512 -       clk = at91_clk_register_rm9200_main(pmc, name, parent_name);
5513 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5514 +       if (IS_ERR(regmap))
5515 +               return;
5517 +       clk = at91_clk_register_rm9200_main(regmap, name, parent_name);
5518         if (IS_ERR(clk))
5519                 return;
5521         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5523 +CLK_OF_DECLARE(at91rm9200_clk_main, "atmel,at91rm9200-clk-main",
5524 +              of_at91rm9200_clk_main_setup);
5526 -static irqreturn_t clk_sam9x5_main_irq_handler(int irq, void *dev_id)
5527 +static inline bool clk_sam9x5_main_ready(struct regmap *regmap)
5529 -       struct clk_sam9x5_main *clkmain = dev_id;
5530 +       unsigned int status;
5532 -       wake_up(&clkmain->wait);
5533 -       disable_irq_nosync(clkmain->irq);
5534 +       regmap_read(regmap, AT91_PMC_SR, &status);
5536 -       return IRQ_HANDLED;
5537 +       return status & AT91_PMC_MOSCSELS ? 1 : 0;
5540  static int clk_sam9x5_main_prepare(struct clk_hw *hw)
5542         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5543 -       struct at91_pmc *pmc = clkmain->pmc;
5544 +       struct regmap *regmap = clkmain->regmap;
5546 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5547 -               enable_irq(clkmain->irq);
5548 -               wait_event(clkmain->wait,
5549 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5550 -       }
5551 +       while (!clk_sam9x5_main_ready(regmap))
5552 +               cpu_relax();
5554 -       return clk_main_probe_frequency(pmc);
5555 +       return clk_main_probe_frequency(regmap);
5558  static int clk_sam9x5_main_is_prepared(struct clk_hw *hw)
5560         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5562 -       return !!(pmc_read(clkmain->pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5563 +       return clk_sam9x5_main_ready(clkmain->regmap);
5566  static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5567 @@ -514,30 +485,28 @@ static unsigned long clk_sam9x5_main_recalc_rate(struct clk_hw *hw,
5569         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5571 -       return clk_main_recalc_rate(clkmain->pmc, parent_rate);
5572 +       return clk_main_recalc_rate(clkmain->regmap, parent_rate);
5575  static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5577         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5578 -       struct at91_pmc *pmc = clkmain->pmc;
5579 -       u32 tmp;
5580 +       struct regmap *regmap = clkmain->regmap;
5581 +       unsigned int tmp;
5583         if (index > 1)
5584                 return -EINVAL;
5586 -       tmp = pmc_read(pmc, AT91_CKGR_MOR) & ~MOR_KEY_MASK;
5587 +       regmap_read(regmap, AT91_CKGR_MOR, &tmp);
5588 +       tmp &= ~MOR_KEY_MASK;
5590         if (index && !(tmp & AT91_PMC_MOSCSEL))
5591 -               pmc_write(pmc, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5592 +               regmap_write(regmap, AT91_CKGR_MOR, tmp | AT91_PMC_MOSCSEL);
5593         else if (!index && (tmp & AT91_PMC_MOSCSEL))
5594 -               pmc_write(pmc, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5595 +               regmap_write(regmap, AT91_CKGR_MOR, tmp & ~AT91_PMC_MOSCSEL);
5597 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS)) {
5598 -               enable_irq(clkmain->irq);
5599 -               wait_event(clkmain->wait,
5600 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MOSCSELS);
5601 -       }
5602 +       while (!clk_sam9x5_main_ready(regmap))
5603 +               cpu_relax();
5605         return 0;
5607 @@ -545,8 +514,11 @@ static int clk_sam9x5_main_set_parent(struct clk_hw *hw, u8 index)
5608  static u8 clk_sam9x5_main_get_parent(struct clk_hw *hw)
5610         struct clk_sam9x5_main *clkmain = to_clk_sam9x5_main(hw);
5611 +       unsigned int status;
5613 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5615 -       return !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) & AT91_PMC_MOSCEN);
5616 +       return status & AT91_PMC_MOSCEN ? 1 : 0;
5619  static const struct clk_ops sam9x5_main_ops = {
5620 @@ -558,18 +530,17 @@ static const struct clk_ops sam9x5_main_ops = {
5621  };
5623  static struct clk * __init
5624 -at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5625 -                             unsigned int irq,
5626 +at91_clk_register_sam9x5_main(struct regmap *regmap,
5627                               const char *name,
5628                               const char **parent_names,
5629                               int num_parents)
5631 -       int ret;
5632         struct clk_sam9x5_main *clkmain;
5633         struct clk *clk = NULL;
5634         struct clk_init_data init;
5635 +       unsigned int status;
5637 -       if (!pmc || !irq || !name)
5638 +       if (!name)
5639                 return ERR_PTR(-EINVAL);
5641         if (!parent_names || !num_parents)
5642 @@ -586,51 +557,42 @@ at91_clk_register_sam9x5_main(struct at91_pmc *pmc,
5643         init.flags = CLK_SET_PARENT_GATE;
5645         clkmain->hw.init = &init;
5646 -       clkmain->pmc = pmc;
5647 -       clkmain->irq = irq;
5648 -       clkmain->parent = !!(pmc_read(clkmain->pmc, AT91_CKGR_MOR) &
5649 -                            AT91_PMC_MOSCEN);
5650 -       init_waitqueue_head(&clkmain->wait);
5651 -       irq_set_status_flags(clkmain->irq, IRQ_NOAUTOEN);
5652 -       ret = request_irq(clkmain->irq, clk_sam9x5_main_irq_handler,
5653 -                         IRQF_TRIGGER_HIGH, name, clkmain);
5654 -       if (ret)
5655 -               return ERR_PTR(ret);
5656 +       clkmain->regmap = regmap;
5657 +       regmap_read(clkmain->regmap, AT91_CKGR_MOR, &status);
5658 +       clkmain->parent = status & AT91_PMC_MOSCEN ? 1 : 0;
5660         clk = clk_register(NULL, &clkmain->hw);
5661 -       if (IS_ERR(clk)) {
5662 -               free_irq(clkmain->irq, clkmain);
5663 +       if (IS_ERR(clk))
5664                 kfree(clkmain);
5665 -       }
5667         return clk;
5670 -void __init of_at91sam9x5_clk_main_setup(struct device_node *np,
5671 -                                        struct at91_pmc *pmc)
5672 +static void __init of_at91sam9x5_clk_main_setup(struct device_node *np)
5674         struct clk *clk;
5675         const char *parent_names[2];
5676         int num_parents;
5677 -       unsigned int irq;
5678         const char *name = np->name;
5679 +       struct regmap *regmap;
5681         num_parents = of_clk_get_parent_count(np);
5682         if (num_parents <= 0 || num_parents > 2)
5683                 return;
5685         of_clk_parent_fill(np, parent_names, num_parents);
5686 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5687 +       if (IS_ERR(regmap))
5688 +               return;
5690         of_property_read_string(np, "clock-output-names", &name);
5692 -       irq = irq_of_parse_and_map(np, 0);
5693 -       if (!irq)
5694 -               return;
5696 -       clk = at91_clk_register_sam9x5_main(pmc, irq, name, parent_names,
5697 +       clk = at91_clk_register_sam9x5_main(regmap, name, parent_names,
5698                                             num_parents);
5699         if (IS_ERR(clk))
5700                 return;
5702         of_clk_add_provider(np, of_clk_src_simple_get, clk);
5704 +CLK_OF_DECLARE(at91sam9x5_clk_main, "atmel,at91sam9x5-clk-main",
5705 +              of_at91sam9x5_clk_main_setup);
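clk-main (and clk-master below) drops the interrupt-plus-waitqueue scheme for waiting on PMC status bits: with the PMC accessed as a syscon there is no per-clock PMC interrupt to request anymore, so readiness is simply polled from AT91_PMC_SR with cpu_relax(). The bits in question (MOSCS, MOSCRCS, MOSCSELS, MCKRDY) are set by hardware within a bounded settling time, so a busy poll in the prepare/set_parent paths is acceptable. All of the ready checks reduce to the same shape; a generic sketch (not part of the patch):

static bool example_pmc_bit_set(struct regmap *regmap, u32 mask)
{
	unsigned int status;

	regmap_read(regmap, AT91_PMC_SR, &status);
	return status & mask;
}

static void example_wait_pmc_bit(struct regmap *regmap, u32 mask)
{
	/* bounded hardware settling time, so a busy poll is acceptable */
	while (!example_pmc_bit_set(regmap, mask))
		cpu_relax();
}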
5706 diff --git a/drivers/clk/at91/clk-master.c b/drivers/clk/at91/clk-master.c
5707 index 620ea323356b..7d4a1864ea7c 100644
5708 --- a/drivers/clk/at91/clk-master.c
5709 +++ b/drivers/clk/at91/clk-master.c
5710 @@ -12,13 +12,8 @@
5711  #include <linux/clkdev.h>
5712  #include <linux/clk/at91_pmc.h>
5713  #include <linux/of.h>
5714 -#include <linux/of_address.h>
5715 -#include <linux/of_irq.h>
5716 -#include <linux/io.h>
5717 -#include <linux/wait.h>
5718 -#include <linux/sched.h>
5719 -#include <linux/interrupt.h>
5720 -#include <linux/irq.h>
5721 +#include <linux/mfd/syscon.h>
5722 +#include <linux/regmap.h>
5724  #include "pmc.h"
5726 @@ -44,32 +39,26 @@ struct clk_master_layout {
5728  struct clk_master {
5729         struct clk_hw hw;
5730 -       struct at91_pmc *pmc;
5731 -       unsigned int irq;
5732 -       wait_queue_head_t wait;
5733 +       struct regmap *regmap;
5734         const struct clk_master_layout *layout;
5735         const struct clk_master_characteristics *characteristics;
5736  };
5738 -static irqreturn_t clk_master_irq_handler(int irq, void *dev_id)
5739 +static inline bool clk_master_ready(struct regmap *regmap)
5741 -       struct clk_master *master = (struct clk_master *)dev_id;
5742 +       unsigned int status;
5744 -       wake_up(&master->wait);
5745 -       disable_irq_nosync(master->irq);
5746 +       regmap_read(regmap, AT91_PMC_SR, &status);
5748 -       return IRQ_HANDLED;
5749 +       return status & AT91_PMC_MCKRDY ? 1 : 0;
5752  static int clk_master_prepare(struct clk_hw *hw)
5754         struct clk_master *master = to_clk_master(hw);
5755 -       struct at91_pmc *pmc = master->pmc;
5757 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY)) {
5758 -               enable_irq(master->irq);
5759 -               wait_event(master->wait,
5760 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5761 -       }
5762 +       while (!clk_master_ready(master->regmap))
5763 +               cpu_relax();
5765         return 0;
5767 @@ -78,7 +67,7 @@ static int clk_master_is_prepared(struct clk_hw *hw)
5769         struct clk_master *master = to_clk_master(hw);
5771 -       return !!(pmc_read(master->pmc, AT91_PMC_SR) & AT91_PMC_MCKRDY);
5772 +       return clk_master_ready(master->regmap);
5775  static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5776 @@ -88,18 +77,16 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5777         u8 div;
5778         unsigned long rate = parent_rate;
5779         struct clk_master *master = to_clk_master(hw);
5780 -       struct at91_pmc *pmc = master->pmc;
5781         const struct clk_master_layout *layout = master->layout;
5782         const struct clk_master_characteristics *characteristics =
5783                                                 master->characteristics;
5784 -       u32 tmp;
5785 +       unsigned int mckr;
5787 -       pmc_lock(pmc);
5788 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & layout->mask;
5789 -       pmc_unlock(pmc);
5790 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5791 +       mckr &= layout->mask;
5793 -       pres = (tmp >> layout->pres_shift) & MASTER_PRES_MASK;
5794 -       div = (tmp >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5795 +       pres = (mckr >> layout->pres_shift) & MASTER_PRES_MASK;
5796 +       div = (mckr >> MASTER_DIV_SHIFT) & MASTER_DIV_MASK;
5798         if (characteristics->have_div3_pres && pres == MASTER_PRES_MAX)
5799                 rate /= 3;
5800 @@ -119,9 +106,11 @@ static unsigned long clk_master_recalc_rate(struct clk_hw *hw,
5801  static u8 clk_master_get_parent(struct clk_hw *hw)
5803         struct clk_master *master = to_clk_master(hw);
5804 -       struct at91_pmc *pmc = master->pmc;
5805 +       unsigned int mckr;
5807 -       return pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_CSS;
5808 +       regmap_read(master->regmap, AT91_PMC_MCKR, &mckr);
5810 +       return mckr & AT91_PMC_CSS;
5813  static const struct clk_ops master_ops = {
5814 @@ -132,18 +121,17 @@ static const struct clk_ops master_ops = {
5815  };
5817  static struct clk * __init
5818 -at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5819 +at91_clk_register_master(struct regmap *regmap,
5820                 const char *name, int num_parents,
5821                 const char **parent_names,
5822                 const struct clk_master_layout *layout,
5823                 const struct clk_master_characteristics *characteristics)
5825 -       int ret;
5826         struct clk_master *master;
5827         struct clk *clk = NULL;
5828         struct clk_init_data init;
5830 -       if (!pmc || !irq || !name || !num_parents || !parent_names)
5831 +       if (!name || !num_parents || !parent_names)
5832                 return ERR_PTR(-EINVAL);
5834         master = kzalloc(sizeof(*master), GFP_KERNEL);
5835 @@ -159,20 +147,10 @@ at91_clk_register_master(struct at91_pmc *pmc, unsigned int irq,
5836         master->hw.init = &init;
5837         master->layout = layout;
5838         master->characteristics = characteristics;
5839 -       master->pmc = pmc;
5840 -       master->irq = irq;
5841 -       init_waitqueue_head(&master->wait);
5842 -       irq_set_status_flags(master->irq, IRQ_NOAUTOEN);
5843 -       ret = request_irq(master->irq, clk_master_irq_handler,
5844 -                         IRQF_TRIGGER_HIGH, "clk-master", master);
5845 -       if (ret) {
5846 -               kfree(master);
5847 -               return ERR_PTR(ret);
5848 -       }
5849 +       master->regmap = regmap;
5851         clk = clk_register(NULL, &master->hw);
5852         if (IS_ERR(clk)) {
5853 -               free_irq(master->irq, master);
5854                 kfree(master);
5855         }
5857 @@ -217,15 +195,15 @@ out_free_characteristics:
5860  static void __init
5861 -of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5862 +of_at91_clk_master_setup(struct device_node *np,
5863                          const struct clk_master_layout *layout)
5865         struct clk *clk;
5866         int num_parents;
5867 -       unsigned int irq;
5868         const char *parent_names[MASTER_SOURCE_MAX];
5869         const char *name = np->name;
5870         struct clk_master_characteristics *characteristics;
5871 +       struct regmap *regmap;
5873         num_parents = of_clk_get_parent_count(np);
5874         if (num_parents <= 0 || num_parents > MASTER_SOURCE_MAX)
5875 @@ -239,11 +217,11 @@ of_at91_clk_master_setup(struct device_node *np, struct at91_pmc *pmc,
5876         if (!characteristics)
5877                 return;
5879 -       irq = irq_of_parse_and_map(np, 0);
5880 -       if (!irq)
5881 -               goto out_free_characteristics;
5882 +       regmap = syscon_node_to_regmap(of_get_parent(np));
5883 +       if (IS_ERR(regmap))
5884 +               return;
5886 -       clk = at91_clk_register_master(pmc, irq, name, num_parents,
5887 +       clk = at91_clk_register_master(regmap, name, num_parents,
5888                                        parent_names, layout,
5889                                        characteristics);
5890         if (IS_ERR(clk))
5891 @@ -256,14 +234,16 @@ out_free_characteristics:
5892         kfree(characteristics);
5895 -void __init of_at91rm9200_clk_master_setup(struct device_node *np,
5896 -                                          struct at91_pmc *pmc)
5897 +static void __init of_at91rm9200_clk_master_setup(struct device_node *np)
5899 -       of_at91_clk_master_setup(np, pmc, &at91rm9200_master_layout);
5900 +       of_at91_clk_master_setup(np, &at91rm9200_master_layout);
5902 +CLK_OF_DECLARE(at91rm9200_clk_master, "atmel,at91rm9200-clk-master",
5903 +              of_at91rm9200_clk_master_setup);
5905 -void __init of_at91sam9x5_clk_master_setup(struct device_node *np,
5906 -                                          struct at91_pmc *pmc)
5907 +static void __init of_at91sam9x5_clk_master_setup(struct device_node *np)
5909 -       of_at91_clk_master_setup(np, pmc, &at91sam9x5_master_layout);
5910 +       of_at91_clk_master_setup(np, &at91sam9x5_master_layout);
5912 +CLK_OF_DECLARE(at91sam9x5_clk_master, "atmel,at91sam9x5-clk-master",
5913 +              of_at91sam9x5_clk_master_setup);
5914 diff --git a/drivers/clk/at91/clk-peripheral.c b/drivers/clk/at91/clk-peripheral.c
5915 index 58f3b568e9cb..d69cd2a121b1 100644
5916 --- a/drivers/clk/at91/clk-peripheral.c
5917 +++ b/drivers/clk/at91/clk-peripheral.c
5918 @@ -12,11 +12,13 @@
5919  #include <linux/clkdev.h>
5920  #include <linux/clk/at91_pmc.h>
5921  #include <linux/of.h>
5922 -#include <linux/of_address.h>
5923 -#include <linux/io.h>
5924 +#include <linux/mfd/syscon.h>
5925 +#include <linux/regmap.h>
5927  #include "pmc.h"
5929 +DEFINE_SPINLOCK(pmc_pcr_lock);
5931  #define PERIPHERAL_MAX         64
5933  #define PERIPHERAL_AT91RM9200  0
5934 @@ -33,7 +35,7 @@
5936  struct clk_peripheral {
5937         struct clk_hw hw;
5938 -       struct at91_pmc *pmc;
5939 +       struct regmap *regmap;
5940         u32 id;
5941  };
5943 @@ -41,8 +43,9 @@ struct clk_peripheral {
5945  struct clk_sam9x5_peripheral {
5946         struct clk_hw hw;
5947 -       struct at91_pmc *pmc;
5948 +       struct regmap *regmap;
5949         struct clk_range range;
5950 +       spinlock_t *lock;
5951         u32 id;
5952         u32 div;
5953         bool auto_div;
5954 @@ -54,7 +57,6 @@ struct clk_sam9x5_peripheral {
5955  static int clk_peripheral_enable(struct clk_hw *hw)
5957         struct clk_peripheral *periph = to_clk_peripheral(hw);
5958 -       struct at91_pmc *pmc = periph->pmc;
5959         int offset = AT91_PMC_PCER;
5960         u32 id = periph->id;
5962 @@ -62,14 +64,14 @@ static int clk_peripheral_enable(struct clk_hw *hw)
5963                 return 0;
5964         if (id > PERIPHERAL_ID_MAX)
5965                 offset = AT91_PMC_PCER1;
5966 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5967 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5969         return 0;
5972  static void clk_peripheral_disable(struct clk_hw *hw)
5974         struct clk_peripheral *periph = to_clk_peripheral(hw);
5975 -       struct at91_pmc *pmc = periph->pmc;
5976         int offset = AT91_PMC_PCDR;
5977         u32 id = periph->id;
5979 @@ -77,21 +79,23 @@ static void clk_peripheral_disable(struct clk_hw *hw)
5980                 return;
5981         if (id > PERIPHERAL_ID_MAX)
5982                 offset = AT91_PMC_PCDR1;
5983 -       pmc_write(pmc, offset, PERIPHERAL_MASK(id));
5984 +       regmap_write(periph->regmap, offset, PERIPHERAL_MASK(id));
5987  static int clk_peripheral_is_enabled(struct clk_hw *hw)
5989         struct clk_peripheral *periph = to_clk_peripheral(hw);
5990 -       struct at91_pmc *pmc = periph->pmc;
5991         int offset = AT91_PMC_PCSR;
5992 +       unsigned int status;
5993         u32 id = periph->id;
5995         if (id < PERIPHERAL_ID_MIN)
5996                 return 1;
5997         if (id > PERIPHERAL_ID_MAX)
5998                 offset = AT91_PMC_PCSR1;
5999 -       return !!(pmc_read(pmc, offset) & PERIPHERAL_MASK(id));
6000 +       regmap_read(periph->regmap, offset, &status);
6002 +       return status & PERIPHERAL_MASK(id) ? 1 : 0;
6005  static const struct clk_ops peripheral_ops = {
6006 @@ -101,14 +105,14 @@ static const struct clk_ops peripheral_ops = {
6007  };
6009  static struct clk * __init
6010 -at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
6011 +at91_clk_register_peripheral(struct regmap *regmap, const char *name,
6012                              const char *parent_name, u32 id)
6014         struct clk_peripheral *periph;
6015         struct clk *clk = NULL;
6016         struct clk_init_data init;
6018 -       if (!pmc || !name || !parent_name || id > PERIPHERAL_ID_MAX)
6019 +       if (!name || !parent_name || id > PERIPHERAL_ID_MAX)
6020                 return ERR_PTR(-EINVAL);
6022         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6023 @@ -123,7 +127,7 @@ at91_clk_register_peripheral(struct at91_pmc *pmc, const char *name,
6025         periph->id = id;
6026         periph->hw.init = &init;
6027 -       periph->pmc = pmc;
6028 +       periph->regmap = regmap;
6030         clk = clk_register(NULL, &periph->hw);
6031         if (IS_ERR(clk))
6032 @@ -160,53 +164,58 @@ static void clk_sam9x5_peripheral_autodiv(struct clk_sam9x5_peripheral *periph)
6033  static int clk_sam9x5_peripheral_enable(struct clk_hw *hw)
6035         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6036 -       struct at91_pmc *pmc = periph->pmc;
6037 -       u32 tmp;
6038 +       unsigned long flags;
6040         if (periph->id < PERIPHERAL_ID_MIN)
6041                 return 0;
6043 -       pmc_lock(pmc);
6044 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6045 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_DIV_MASK;
6046 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_DIV(periph->div)
6047 -                                        | AT91_PMC_PCR_CMD
6048 -                                        | AT91_PMC_PCR_EN);
6049 -       pmc_unlock(pmc);
6050 +       spin_lock_irqsave(periph->lock, flags);
6051 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6052 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6053 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6054 +                          AT91_PMC_PCR_DIV_MASK | AT91_PMC_PCR_CMD |
6055 +                          AT91_PMC_PCR_EN,
6056 +                          AT91_PMC_PCR_DIV(periph->div) |
6057 +                          AT91_PMC_PCR_CMD |
6058 +                          AT91_PMC_PCR_EN);
6059 +       spin_unlock_irqrestore(periph->lock, flags);
6061         return 0;
6064  static void clk_sam9x5_peripheral_disable(struct clk_hw *hw)
6066         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6067 -       struct at91_pmc *pmc = periph->pmc;
6068 -       u32 tmp;
6069 +       unsigned long flags;
6071         if (periph->id < PERIPHERAL_ID_MIN)
6072                 return;
6074 -       pmc_lock(pmc);
6075 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6076 -       tmp = pmc_read(pmc, AT91_PMC_PCR) & ~AT91_PMC_PCR_EN;
6077 -       pmc_write(pmc, AT91_PMC_PCR, tmp | AT91_PMC_PCR_CMD);
6078 -       pmc_unlock(pmc);
6079 +       spin_lock_irqsave(periph->lock, flags);
6080 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6081 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6082 +       regmap_update_bits(periph->regmap, AT91_PMC_PCR,
6083 +                          AT91_PMC_PCR_EN | AT91_PMC_PCR_CMD,
6084 +                          AT91_PMC_PCR_CMD);
6085 +       spin_unlock_irqrestore(periph->lock, flags);
6088  static int clk_sam9x5_peripheral_is_enabled(struct clk_hw *hw)
6090         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6091 -       struct at91_pmc *pmc = periph->pmc;
6092 -       int ret;
6093 +       unsigned long flags;
6094 +       unsigned int status;
6096         if (periph->id < PERIPHERAL_ID_MIN)
6097                 return 1;
6099 -       pmc_lock(pmc);
6100 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6101 -       ret = !!(pmc_read(pmc, AT91_PMC_PCR) & AT91_PMC_PCR_EN);
6102 -       pmc_unlock(pmc);
6103 +       spin_lock_irqsave(periph->lock, flags);
6104 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6105 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6106 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6107 +       spin_unlock_irqrestore(periph->lock, flags);
6109 -       return ret;
6110 +       return status & AT91_PMC_PCR_EN ? 1 : 0;
6113  static unsigned long
6114 @@ -214,19 +223,20 @@ clk_sam9x5_peripheral_recalc_rate(struct clk_hw *hw,
6115                                   unsigned long parent_rate)
6117         struct clk_sam9x5_peripheral *periph = to_clk_sam9x5_peripheral(hw);
6118 -       struct at91_pmc *pmc = periph->pmc;
6119 -       u32 tmp;
6120 +       unsigned long flags;
6121 +       unsigned int status;
6123         if (periph->id < PERIPHERAL_ID_MIN)
6124                 return parent_rate;
6126 -       pmc_lock(pmc);
6127 -       pmc_write(pmc, AT91_PMC_PCR, (periph->id & AT91_PMC_PCR_PID_MASK));
6128 -       tmp = pmc_read(pmc, AT91_PMC_PCR);
6129 -       pmc_unlock(pmc);
6130 +       spin_lock_irqsave(periph->lock, flags);
6131 +       regmap_write(periph->regmap, AT91_PMC_PCR,
6132 +                    (periph->id & AT91_PMC_PCR_PID_MASK));
6133 +       regmap_read(periph->regmap, AT91_PMC_PCR, &status);
6134 +       spin_unlock_irqrestore(periph->lock, flags);
6136 -       if (tmp & AT91_PMC_PCR_EN) {
6137 -               periph->div = PERIPHERAL_RSHIFT(tmp);
6138 +       if (status & AT91_PMC_PCR_EN) {
6139 +               periph->div = PERIPHERAL_RSHIFT(status);
6140                 periph->auto_div = false;
6141         } else {
6142                 clk_sam9x5_peripheral_autodiv(periph);
6143 @@ -318,15 +328,15 @@ static const struct clk_ops sam9x5_peripheral_ops = {
6144  };
6146  static struct clk * __init
6147 -at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6148 -                                   const char *parent_name, u32 id,
6149 -                                   const struct clk_range *range)
6150 +at91_clk_register_sam9x5_peripheral(struct regmap *regmap, spinlock_t *lock,
6151 +                                   const char *name, const char *parent_name,
6152 +                                   u32 id, const struct clk_range *range)
6154         struct clk_sam9x5_peripheral *periph;
6155         struct clk *clk = NULL;
6156         struct clk_init_data init;
6158 -       if (!pmc || !name || !parent_name)
6159 +       if (!name || !parent_name)
6160                 return ERR_PTR(-EINVAL);
6162         periph = kzalloc(sizeof(*periph), GFP_KERNEL);
6163 @@ -342,7 +352,8 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6164         periph->id = id;
6165         periph->hw.init = &init;
6166         periph->div = 0;
6167 -       periph->pmc = pmc;
6168 +       periph->regmap = regmap;
6169 +       periph->lock = lock;
6170         periph->auto_div = true;
6171         periph->range = *range;
6173 @@ -356,7 +367,7 @@ at91_clk_register_sam9x5_peripheral(struct at91_pmc *pmc, const char *name,
6176  static void __init
6177 -of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6178 +of_at91_clk_periph_setup(struct device_node *np, u8 type)
6180         int num;
6181         u32 id;
6182 @@ -364,6 +375,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6183         const char *parent_name;
6184         const char *name;
6185         struct device_node *periphclknp;
6186 +       struct regmap *regmap;
6188         parent_name = of_clk_get_parent_name(np, 0);
6189         if (!parent_name)
6190 @@ -373,6 +385,10 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6191         if (!num || num > PERIPHERAL_MAX)
6192                 return;
6194 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6195 +       if (IS_ERR(regmap))
6196 +               return;
6198         for_each_child_of_node(np, periphclknp) {
6199                 if (of_property_read_u32(periphclknp, "reg", &id))
6200                         continue;
6201 @@ -384,7 +400,7 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6202                         name = periphclknp->name;
6204                 if (type == PERIPHERAL_AT91RM9200) {
6205 -                       clk = at91_clk_register_peripheral(pmc, name,
6206 +                       clk = at91_clk_register_peripheral(regmap, name,
6207                                                            parent_name, id);
6208                 } else {
6209                         struct clk_range range = CLK_RANGE(0, 0);
6210 @@ -393,7 +409,9 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6211                                               "atmel,clk-output-range",
6212                                               &range);
6214 -                       clk = at91_clk_register_sam9x5_peripheral(pmc, name,
6215 +                       clk = at91_clk_register_sam9x5_peripheral(regmap,
6216 +                                                                 &pmc_pcr_lock,
6217 +                                                                 name,
6218                                                                   parent_name,
6219                                                                   id, &range);
6220                 }
6221 @@ -405,14 +423,16 @@ of_at91_clk_periph_setup(struct device_node *np, struct at91_pmc *pmc, u8 type)
6222         }
6225 -void __init of_at91rm9200_clk_periph_setup(struct device_node *np,
6226 -                                          struct at91_pmc *pmc)
6227 +static void __init of_at91rm9200_clk_periph_setup(struct device_node *np)
6229 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91RM9200);
6230 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91RM9200);
6232 +CLK_OF_DECLARE(at91rm9200_clk_periph, "atmel,at91rm9200-clk-peripheral",
6233 +              of_at91rm9200_clk_periph_setup);
6235 -void __init of_at91sam9x5_clk_periph_setup(struct device_node *np,
6236 -                                          struct at91_pmc *pmc)
6237 +static void __init of_at91sam9x5_clk_periph_setup(struct device_node *np)
6239 -       of_at91_clk_periph_setup(np, pmc, PERIPHERAL_AT91SAM9X5);
6240 +       of_at91_clk_periph_setup(np, PERIPHERAL_AT91SAM9X5);
6242 +CLK_OF_DECLARE(at91sam9x5_clk_periph, "atmel,at91sam9x5-clk-peripheral",
6243 +              of_at91sam9x5_clk_periph_setup);
6244 diff --git a/drivers/clk/at91/clk-pll.c b/drivers/clk/at91/clk-pll.c
6245 index 18b60f4895a6..fb2e0b56d4b7 100644
6246 --- a/drivers/clk/at91/clk-pll.c
6247 +++ b/drivers/clk/at91/clk-pll.c
6248 @@ -12,14 +12,8 @@
6249  #include <linux/clkdev.h>
6250  #include <linux/clk/at91_pmc.h>
6251  #include <linux/of.h>
6252 -#include <linux/of_address.h>
6253 -#include <linux/of_irq.h>
6254 -#include <linux/io.h>
6255 -#include <linux/kernel.h>
6256 -#include <linux/wait.h>
6257 -#include <linux/sched.h>
6258 -#include <linux/interrupt.h>
6259 -#include <linux/irq.h>
6260 +#include <linux/mfd/syscon.h>
6261 +#include <linux/regmap.h>
6263  #include "pmc.h"
6265 @@ -58,9 +52,7 @@ struct clk_pll_layout {
6267  struct clk_pll {
6268         struct clk_hw hw;
6269 -       struct at91_pmc *pmc;
6270 -       unsigned int irq;
6271 -       wait_queue_head_t wait;
6272 +       struct regmap *regmap;
6273         u8 id;
6274         u8 div;
6275         u8 range;
6276 @@ -69,20 +61,19 @@ struct clk_pll {
6277         const struct clk_pll_characteristics *characteristics;
6278  };
6280 -static irqreturn_t clk_pll_irq_handler(int irq, void *dev_id)
6281 +static inline bool clk_pll_ready(struct regmap *regmap, int id)
6283 -       struct clk_pll *pll = (struct clk_pll *)dev_id;
6284 +       unsigned int status;
6286 -       wake_up(&pll->wait);
6287 -       disable_irq_nosync(pll->irq);
6288 +       regmap_read(regmap, AT91_PMC_SR, &status);
6290 -       return IRQ_HANDLED;
6291 +       return status & PLL_STATUS_MASK(id) ? 1 : 0;
6294  static int clk_pll_prepare(struct clk_hw *hw)
6296         struct clk_pll *pll = to_clk_pll(hw);
6297 -       struct at91_pmc *pmc = pll->pmc;
6298 +       struct regmap *regmap = pll->regmap;
6299         const struct clk_pll_layout *layout = pll->layout;
6300         const struct clk_pll_characteristics *characteristics =
6301                                                         pll->characteristics;
6302 @@ -90,39 +81,34 @@ static int clk_pll_prepare(struct clk_hw *hw)
6303         u32 mask = PLL_STATUS_MASK(id);
6304         int offset = PLL_REG(id);
6305         u8 out = 0;
6306 -       u32 pllr, icpr;
6307 +       unsigned int pllr;
6308 +       unsigned int status;
6309         u8 div;
6310         u16 mul;
6312 -       pllr = pmc_read(pmc, offset);
6313 +       regmap_read(regmap, offset, &pllr);
6314         div = PLL_DIV(pllr);
6315         mul = PLL_MUL(pllr, layout);
6317 -       if ((pmc_read(pmc, AT91_PMC_SR) & mask) &&
6318 +       regmap_read(regmap, AT91_PMC_SR, &status);
6319 +       if ((status & mask) &&
6320             (div == pll->div && mul == pll->mul))
6321                 return 0;
6323         if (characteristics->out)
6324                 out = characteristics->out[pll->range];
6325 -       if (characteristics->icpll) {
6326 -               icpr = pmc_read(pmc, AT91_PMC_PLLICPR) & ~PLL_ICPR_MASK(id);
6327 -               icpr |= (characteristics->icpll[pll->range] <<
6328 -                       PLL_ICPR_SHIFT(id));
6329 -               pmc_write(pmc, AT91_PMC_PLLICPR, icpr);
6330 -       }
6332 -       pllr &= ~layout->pllr_mask;
6333 -       pllr |= layout->pllr_mask &
6334 -              (pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6335 -               (out << PLL_OUT_SHIFT) |
6336 -               ((pll->mul & layout->mul_mask) << layout->mul_shift));
6337 -       pmc_write(pmc, offset, pllr);
6339 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
6340 -               enable_irq(pll->irq);
6341 -               wait_event(pll->wait,
6342 -                          pmc_read(pmc, AT91_PMC_SR) & mask);
6343 -       }
6344 +       if (characteristics->icpll)
6345 +               regmap_update_bits(regmap, AT91_PMC_PLLICPR, PLL_ICPR_MASK(id),
6346 +                       characteristics->icpll[pll->range] << PLL_ICPR_SHIFT(id));
6348 +       regmap_update_bits(regmap, offset, layout->pllr_mask,
6349 +                       pll->div | (PLL_MAX_COUNT << PLL_COUNT_SHIFT) |
6350 +                       (out << PLL_OUT_SHIFT) |
6351 +                       ((pll->mul & layout->mul_mask) << layout->mul_shift));
6353 +       while (!clk_pll_ready(regmap, pll->id))
6354 +               cpu_relax();
6356         return 0;
6358 @@ -130,32 +116,35 @@ static int clk_pll_prepare(struct clk_hw *hw)
6359  static int clk_pll_is_prepared(struct clk_hw *hw)
6361         struct clk_pll *pll = to_clk_pll(hw);
6362 -       struct at91_pmc *pmc = pll->pmc;
6364 -       return !!(pmc_read(pmc, AT91_PMC_SR) &
6365 -                 PLL_STATUS_MASK(pll->id));
6366 +       return clk_pll_ready(pll->regmap, pll->id);
6369  static void clk_pll_unprepare(struct clk_hw *hw)
6371         struct clk_pll *pll = to_clk_pll(hw);
6372 -       struct at91_pmc *pmc = pll->pmc;
6373 -       const struct clk_pll_layout *layout = pll->layout;
6374 -       int offset = PLL_REG(pll->id);
6375 -       u32 tmp = pmc_read(pmc, offset) & ~(layout->pllr_mask);
6376 +       unsigned int mask = pll->layout->pllr_mask;
6378 -       pmc_write(pmc, offset, tmp);
6379 +       regmap_update_bits(pll->regmap, PLL_REG(pll->id), mask, ~mask);
6382  static unsigned long clk_pll_recalc_rate(struct clk_hw *hw,
6383                                          unsigned long parent_rate)
6385         struct clk_pll *pll = to_clk_pll(hw);
6386 +       unsigned int pllr;
6387 +       u16 mul;
6388 +       u8 div;
6390 -       if (!pll->div || !pll->mul)
6391 +       regmap_read(pll->regmap, PLL_REG(pll->id), &pllr);
6393 +       div = PLL_DIV(pllr);
6394 +       mul = PLL_MUL(pllr, pll->layout);
6396 +       if (!div || !mul)
6397                 return 0;
6399 -       return (parent_rate / pll->div) * (pll->mul + 1);
6400 +       return (parent_rate / div) * (mul + 1);
6403  static long clk_pll_get_best_div_mul(struct clk_pll *pll, unsigned long rate,
6404 @@ -308,7 +297,7 @@ static const struct clk_ops pll_ops = {
6405  };
6407  static struct clk * __init
6408 -at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6409 +at91_clk_register_pll(struct regmap *regmap, const char *name,
6410                       const char *parent_name, u8 id,
6411                       const struct clk_pll_layout *layout,
6412                       const struct clk_pll_characteristics *characteristics)
6413 @@ -316,9 +305,8 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6414         struct clk_pll *pll;
6415         struct clk *clk = NULL;
6416         struct clk_init_data init;
6417 -       int ret;
6418         int offset = PLL_REG(id);
6419 -       u32 tmp;
6420 +       unsigned int pllr;
6422         if (id > PLL_MAX_ID)
6423                 return ERR_PTR(-EINVAL);
6424 @@ -337,23 +325,13 @@ at91_clk_register_pll(struct at91_pmc *pmc, unsigned int irq, const char *name,
6425         pll->hw.init = &init;
6426         pll->layout = layout;
6427         pll->characteristics = characteristics;
6428 -       pll->pmc = pmc;
6429 -       pll->irq = irq;
6430 -       tmp = pmc_read(pmc, offset) & layout->pllr_mask;
6431 -       pll->div = PLL_DIV(tmp);
6432 -       pll->mul = PLL_MUL(tmp, layout);
6433 -       init_waitqueue_head(&pll->wait);
6434 -       irq_set_status_flags(pll->irq, IRQ_NOAUTOEN);
6435 -       ret = request_irq(pll->irq, clk_pll_irq_handler, IRQF_TRIGGER_HIGH,
6436 -                         id ? "clk-pllb" : "clk-plla", pll);
6437 -       if (ret) {
6438 -               kfree(pll);
6439 -               return ERR_PTR(ret);
6440 -       }
6441 +       pll->regmap = regmap;
6442 +       regmap_read(regmap, offset, &pllr);
6443 +       pll->div = PLL_DIV(pllr);
6444 +       pll->mul = PLL_MUL(pllr, layout);
6446         clk = clk_register(NULL, &pll->hw);
6447         if (IS_ERR(clk)) {
6448 -               free_irq(pll->irq, pll);
6449                 kfree(pll);
6450         }
6452 @@ -483,12 +461,12 @@ out_free_characteristics:
6455  static void __init
6456 -of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6457 +of_at91_clk_pll_setup(struct device_node *np,
6458                       const struct clk_pll_layout *layout)
6460         u32 id;
6461 -       unsigned int irq;
6462         struct clk *clk;
6463 +       struct regmap *regmap;
6464         const char *parent_name;
6465         const char *name = np->name;
6466         struct clk_pll_characteristics *characteristics;
6467 @@ -500,15 +478,15 @@ of_at91_clk_pll_setup(struct device_node *np, struct at91_pmc *pmc,
6469         of_property_read_string(np, "clock-output-names", &name);
6471 -       characteristics = of_at91_clk_pll_get_characteristics(np);
6472 -       if (!characteristics)
6473 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6474 +       if (IS_ERR(regmap))
6475                 return;
6477 -       irq = irq_of_parse_and_map(np, 0);
6478 -       if (!irq)
6479 +       characteristics = of_at91_clk_pll_get_characteristics(np);
6480 +       if (!characteristics)
6481                 return;
6483 -       clk = at91_clk_register_pll(pmc, irq, name, parent_name, id, layout,
6484 +       clk = at91_clk_register_pll(regmap, name, parent_name, id, layout,
6485                                     characteristics);
6486         if (IS_ERR(clk))
6487                 goto out_free_characteristics;
6488 @@ -520,26 +498,30 @@ out_free_characteristics:
6489         kfree(characteristics);
6492 -void __init of_at91rm9200_clk_pll_setup(struct device_node *np,
6493 -                                              struct at91_pmc *pmc)
6494 +static void __init of_at91rm9200_clk_pll_setup(struct device_node *np)
6496 -       of_at91_clk_pll_setup(np, pmc, &at91rm9200_pll_layout);
6497 +       of_at91_clk_pll_setup(np, &at91rm9200_pll_layout);
6499 +CLK_OF_DECLARE(at91rm9200_clk_pll, "atmel,at91rm9200-clk-pll",
6500 +              of_at91rm9200_clk_pll_setup);
6502 -void __init of_at91sam9g45_clk_pll_setup(struct device_node *np,
6503 -                                               struct at91_pmc *pmc)
6504 +static void __init of_at91sam9g45_clk_pll_setup(struct device_node *np)
6506 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g45_pll_layout);
6507 +       of_at91_clk_pll_setup(np, &at91sam9g45_pll_layout);
6509 +CLK_OF_DECLARE(at91sam9g45_clk_pll, "atmel,at91sam9g45-clk-pll",
6510 +              of_at91sam9g45_clk_pll_setup);
6512 -void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np,
6513 -                                                struct at91_pmc *pmc)
6514 +static void __init of_at91sam9g20_clk_pllb_setup(struct device_node *np)
6516 -       of_at91_clk_pll_setup(np, pmc, &at91sam9g20_pllb_layout);
6517 +       of_at91_clk_pll_setup(np, &at91sam9g20_pllb_layout);
6519 +CLK_OF_DECLARE(at91sam9g20_clk_pllb, "atmel,at91sam9g20-clk-pllb",
6520 +              of_at91sam9g20_clk_pllb_setup);
6522 -void __init of_sama5d3_clk_pll_setup(struct device_node *np,
6523 -                                           struct at91_pmc *pmc)
6524 +static void __init of_sama5d3_clk_pll_setup(struct device_node *np)
6526 -       of_at91_clk_pll_setup(np, pmc, &sama5d3_pll_layout);
6527 +       of_at91_clk_pll_setup(np, &sama5d3_pll_layout);
6529 +CLK_OF_DECLARE(sama5d3_clk_pll, "atmel,sama5d3-clk-pll",
6530 +              of_sama5d3_clk_pll_setup);
6531 diff --git a/drivers/clk/at91/clk-plldiv.c b/drivers/clk/at91/clk-plldiv.c
6532 index ea226562bb40..2bed26481027 100644
6533 --- a/drivers/clk/at91/clk-plldiv.c
6534 +++ b/drivers/clk/at91/clk-plldiv.c
6535 @@ -12,8 +12,8 @@
6536  #include <linux/clkdev.h>
6537  #include <linux/clk/at91_pmc.h>
6538  #include <linux/of.h>
6539 -#include <linux/of_address.h>
6540 -#include <linux/io.h>
6541 +#include <linux/mfd/syscon.h>
6542 +#include <linux/regmap.h>
6544  #include "pmc.h"
6546 @@ -21,16 +21,18 @@
6548  struct clk_plldiv {
6549         struct clk_hw hw;
6550 -       struct at91_pmc *pmc;
6551 +       struct regmap *regmap;
6552  };
6554  static unsigned long clk_plldiv_recalc_rate(struct clk_hw *hw,
6555                                             unsigned long parent_rate)
6557         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6558 -       struct at91_pmc *pmc = plldiv->pmc;
6559 +       unsigned int mckr;
6561 -       if (pmc_read(pmc, AT91_PMC_MCKR) & AT91_PMC_PLLADIV2)
6562 +       regmap_read(plldiv->regmap, AT91_PMC_MCKR, &mckr);
6564 +       if (mckr & AT91_PMC_PLLADIV2)
6565                 return parent_rate / 2;
6567         return parent_rate;
6568 @@ -57,18 +59,12 @@ static int clk_plldiv_set_rate(struct clk_hw *hw, unsigned long rate,
6569                                unsigned long parent_rate)
6571         struct clk_plldiv *plldiv = to_clk_plldiv(hw);
6572 -       struct at91_pmc *pmc = plldiv->pmc;
6573 -       u32 tmp;
6575 -       if (parent_rate != rate && (parent_rate / 2) != rate)
6576 +       if ((parent_rate != rate) && (parent_rate / 2 != rate))
6577                 return -EINVAL;
6579 -       pmc_lock(pmc);
6580 -       tmp = pmc_read(pmc, AT91_PMC_MCKR) & ~AT91_PMC_PLLADIV2;
6581 -       if ((parent_rate / 2) == rate)
6582 -               tmp |= AT91_PMC_PLLADIV2;
6583 -       pmc_write(pmc, AT91_PMC_MCKR, tmp);
6584 -       pmc_unlock(pmc);
6585 +       regmap_update_bits(plldiv->regmap, AT91_PMC_MCKR, AT91_PMC_PLLADIV2,
6586 +                          parent_rate != rate ? AT91_PMC_PLLADIV2 : 0);
6588         return 0;
6590 @@ -80,7 +76,7 @@ static const struct clk_ops plldiv_ops = {
6591  };
6593  static struct clk * __init
6594 -at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6595 +at91_clk_register_plldiv(struct regmap *regmap, const char *name,
6596                          const char *parent_name)
6598         struct clk_plldiv *plldiv;
6599 @@ -98,7 +94,7 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6600         init.flags = CLK_SET_RATE_GATE;
6602         plldiv->hw.init = &init;
6603 -       plldiv->pmc = pmc;
6604 +       plldiv->regmap = regmap;
6606         clk = clk_register(NULL, &plldiv->hw);
6608 @@ -109,27 +105,27 @@ at91_clk_register_plldiv(struct at91_pmc *pmc, const char *name,
6611  static void __init
6612 -of_at91_clk_plldiv_setup(struct device_node *np, struct at91_pmc *pmc)
6613 +of_at91sam9x5_clk_plldiv_setup(struct device_node *np)
6615         struct clk *clk;
6616         const char *parent_name;
6617         const char *name = np->name;
6618 +       struct regmap *regmap;
6620         parent_name = of_clk_get_parent_name(np, 0);
6622         of_property_read_string(np, "clock-output-names", &name);
6624 -       clk = at91_clk_register_plldiv(pmc, name, parent_name);
6625 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6626 +       if (IS_ERR(regmap))
6627 +               return;
6629 +       clk = at91_clk_register_plldiv(regmap, name, parent_name);
6630         if (IS_ERR(clk))
6631                 return;
6633         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6634         return;
6637 -void __init of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
6638 -                                          struct at91_pmc *pmc)
6640 -       of_at91_clk_plldiv_setup(np, pmc);
6642 +CLK_OF_DECLARE(at91sam9x5_clk_plldiv, "atmel,at91sam9x5-clk-plldiv",
6643 +              of_at91sam9x5_clk_plldiv_setup);
6644 diff --git a/drivers/clk/at91/clk-programmable.c b/drivers/clk/at91/clk-programmable.c
6645 index 14b270b85fec..bc0be629671b 100644
6646 --- a/drivers/clk/at91/clk-programmable.c
6647 +++ b/drivers/clk/at91/clk-programmable.c
6648 @@ -12,10 +12,8 @@
6649  #include <linux/clkdev.h>
6650  #include <linux/clk/at91_pmc.h>
6651  #include <linux/of.h>
6652 -#include <linux/of_address.h>
6653 -#include <linux/io.h>
6654 -#include <linux/wait.h>
6655 -#include <linux/sched.h>
6656 +#include <linux/mfd/syscon.h>
6657 +#include <linux/regmap.h>
6659  #include "pmc.h"
6661 @@ -24,6 +22,7 @@
6663  #define PROG_STATUS_MASK(id)   (1 << ((id) + 8))
6664  #define PROG_PRES_MASK         0x7
6665 +#define PROG_PRES(layout, pckr)        ((pckr >> layout->pres_shift) & PROG_PRES_MASK)
6666  #define PROG_MAX_RM9200_CSS    3
6668  struct clk_programmable_layout {
6669 @@ -34,7 +33,7 @@ struct clk_programmable_layout {
6671  struct clk_programmable {
6672         struct clk_hw hw;
6673 -       struct at91_pmc *pmc;
6674 +       struct regmap *regmap;
6675         u8 id;
6676         const struct clk_programmable_layout *layout;
6677  };
6678 @@ -44,14 +43,12 @@ struct clk_programmable {
6679  static unsigned long clk_programmable_recalc_rate(struct clk_hw *hw,
6680                                                   unsigned long parent_rate)
6682 -       u32 pres;
6683         struct clk_programmable *prog = to_clk_programmable(hw);
6684 -       struct at91_pmc *pmc = prog->pmc;
6685 -       const struct clk_programmable_layout *layout = prog->layout;
6686 +       unsigned int pckr;
6688 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6690 -       pres = (pmc_read(pmc, AT91_PMC_PCKR(prog->id)) >> layout->pres_shift) &
6691 -              PROG_PRES_MASK;
6692 -       return parent_rate >> pres;
6693 +       return parent_rate >> PROG_PRES(prog->layout, pckr);
6696  static int clk_programmable_determine_rate(struct clk_hw *hw,
6697 @@ -101,36 +98,36 @@ static int clk_programmable_set_parent(struct clk_hw *hw, u8 index)
6699         struct clk_programmable *prog = to_clk_programmable(hw);
6700         const struct clk_programmable_layout *layout = prog->layout;
6701 -       struct at91_pmc *pmc = prog->pmc;
6702 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) & ~layout->css_mask;
6703 +       unsigned int mask = layout->css_mask;
6704 +       unsigned int pckr = 0;
6706         if (layout->have_slck_mck)
6707 -               tmp &= AT91_PMC_CSSMCK_MCK;
6708 +               mask |= AT91_PMC_CSSMCK_MCK;
6710         if (index > layout->css_mask) {
6711 -               if (index > PROG_MAX_RM9200_CSS && layout->have_slck_mck) {
6712 -                       tmp |= AT91_PMC_CSSMCK_MCK;
6713 -                       return 0;
6714 -               } else {
6715 +               if (index > PROG_MAX_RM9200_CSS && !layout->have_slck_mck)
6716                         return -EINVAL;
6717 -               }
6719 +               pckr |= AT91_PMC_CSSMCK_MCK;
6720         }
6722 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id), tmp | index);
6723 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id), mask, pckr);
6725         return 0;
6728  static u8 clk_programmable_get_parent(struct clk_hw *hw)
6730 -       u32 tmp;
6731 -       u8 ret;
6732         struct clk_programmable *prog = to_clk_programmable(hw);
6733 -       struct at91_pmc *pmc = prog->pmc;
6734         const struct clk_programmable_layout *layout = prog->layout;
6735 +       unsigned int pckr;
6736 +       u8 ret;
6738 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6740 +       ret = pckr & layout->css_mask;
6742 -       tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id));
6743 -       ret = tmp & layout->css_mask;
6744 -       if (layout->have_slck_mck && (tmp & AT91_PMC_CSSMCK_MCK) && !ret)
6745 +       if (layout->have_slck_mck && (pckr & AT91_PMC_CSSMCK_MCK) && !ret)
6746                 ret = PROG_MAX_RM9200_CSS + 1;
6748         return ret;
6749 @@ -140,26 +137,27 @@ static int clk_programmable_set_rate(struct clk_hw *hw, unsigned long rate,
6750                                      unsigned long parent_rate)
6752         struct clk_programmable *prog = to_clk_programmable(hw);
6753 -       struct at91_pmc *pmc = prog->pmc;
6754         const struct clk_programmable_layout *layout = prog->layout;
6755         unsigned long div = parent_rate / rate;
6756 +       unsigned int pckr;
6757         int shift = 0;
6758 -       u32 tmp = pmc_read(pmc, AT91_PMC_PCKR(prog->id)) &
6759 -                 ~(PROG_PRES_MASK << layout->pres_shift);
6761 +       regmap_read(prog->regmap, AT91_PMC_PCKR(prog->id), &pckr);
6763         if (!div)
6764                 return -EINVAL;
6766         shift = fls(div) - 1;
6768 -       if (div != (1<<shift))
6769 +       if (div != (1 << shift))
6770                 return -EINVAL;
6772         if (shift >= PROG_PRES_MASK)
6773                 return -EINVAL;
6775 -       pmc_write(pmc, AT91_PMC_PCKR(prog->id),
6776 -                 tmp | (shift << layout->pres_shift));
6777 +       regmap_update_bits(prog->regmap, AT91_PMC_PCKR(prog->id),
6778 +                          PROG_PRES_MASK << layout->pres_shift,
6779 +                          shift << layout->pres_shift);
6781         return 0;
6783 @@ -173,7 +171,7 @@ static const struct clk_ops programmable_ops = {
6784  };
6786  static struct clk * __init
6787 -at91_clk_register_programmable(struct at91_pmc *pmc,
6788 +at91_clk_register_programmable(struct regmap *regmap,
6789                                const char *name, const char **parent_names,
6790                                u8 num_parents, u8 id,
6791                                const struct clk_programmable_layout *layout)
6792 @@ -198,7 +196,7 @@ at91_clk_register_programmable(struct at91_pmc *pmc,
6793         prog->id = id;
6794         prog->layout = layout;
6795         prog->hw.init = &init;
6796 -       prog->pmc = pmc;
6797 +       prog->regmap = regmap;
6799         clk = clk_register(NULL, &prog->hw);
6800         if (IS_ERR(clk))
6801 @@ -226,7 +224,7 @@ static const struct clk_programmable_layout at91sam9x5_programmable_layout = {
6802  };
6804  static void __init
6805 -of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6806 +of_at91_clk_prog_setup(struct device_node *np,
6807                        const struct clk_programmable_layout *layout)
6809         int num;
6810 @@ -236,6 +234,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6811         const char *parent_names[PROG_SOURCE_MAX];
6812         const char *name;
6813         struct device_node *progclknp;
6814 +       struct regmap *regmap;
6816         num_parents = of_clk_get_parent_count(np);
6817         if (num_parents <= 0 || num_parents > PROG_SOURCE_MAX)
6818 @@ -247,6 +246,10 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6819         if (!num || num > (PROG_ID_MAX + 1))
6820                 return;
6822 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6823 +       if (IS_ERR(regmap))
6824 +               return;
6826         for_each_child_of_node(np, progclknp) {
6827                 if (of_property_read_u32(progclknp, "reg", &id))
6828                         continue;
6829 @@ -254,7 +257,7 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6830                 if (of_property_read_string(np, "clock-output-names", &name))
6831                         name = progclknp->name;
6833 -               clk = at91_clk_register_programmable(pmc, name,
6834 +               clk = at91_clk_register_programmable(regmap, name,
6835                                                      parent_names, num_parents,
6836                                                      id, layout);
6837                 if (IS_ERR(clk))
6838 @@ -265,20 +268,23 @@ of_at91_clk_prog_setup(struct device_node *np, struct at91_pmc *pmc,
6842 -void __init of_at91rm9200_clk_prog_setup(struct device_node *np,
6843 -                                        struct at91_pmc *pmc)
6844 +static void __init of_at91rm9200_clk_prog_setup(struct device_node *np)
6846 -       of_at91_clk_prog_setup(np, pmc, &at91rm9200_programmable_layout);
6847 +       of_at91_clk_prog_setup(np, &at91rm9200_programmable_layout);
6849 +CLK_OF_DECLARE(at91rm9200_clk_prog, "atmel,at91rm9200-clk-programmable",
6850 +              of_at91rm9200_clk_prog_setup);
6852 -void __init of_at91sam9g45_clk_prog_setup(struct device_node *np,
6853 -                                         struct at91_pmc *pmc)
6854 +static void __init of_at91sam9g45_clk_prog_setup(struct device_node *np)
6856 -       of_at91_clk_prog_setup(np, pmc, &at91sam9g45_programmable_layout);
6857 +       of_at91_clk_prog_setup(np, &at91sam9g45_programmable_layout);
6859 +CLK_OF_DECLARE(at91sam9g45_clk_prog, "atmel,at91sam9g45-clk-programmable",
6860 +              of_at91sam9g45_clk_prog_setup);
6862 -void __init of_at91sam9x5_clk_prog_setup(struct device_node *np,
6863 -                                        struct at91_pmc *pmc)
6864 +static void __init of_at91sam9x5_clk_prog_setup(struct device_node *np)
6866 -       of_at91_clk_prog_setup(np, pmc, &at91sam9x5_programmable_layout);
6867 +       of_at91_clk_prog_setup(np, &at91sam9x5_programmable_layout);
6869 +CLK_OF_DECLARE(at91sam9x5_clk_prog, "atmel,at91sam9x5-clk-programmable",
6870 +              of_at91sam9x5_clk_prog_setup);
6871 diff --git a/drivers/clk/at91/clk-slow.c b/drivers/clk/at91/clk-slow.c
6872 index d0d5076a9b94..221c09684ba3 100644
6873 --- a/drivers/clk/at91/clk-slow.c
6874 +++ b/drivers/clk/at91/clk-slow.c
6875 @@ -13,17 +13,11 @@
6876  #include <linux/clk.h>
6877  #include <linux/clk-provider.h>
6878  #include <linux/clkdev.h>
6879 -#include <linux/slab.h>
6880  #include <linux/clk/at91_pmc.h>
6881  #include <linux/delay.h>
6882  #include <linux/of.h>
6883 -#include <linux/of_address.h>
6884 -#include <linux/of_irq.h>
6885 -#include <linux/io.h>
6886 -#include <linux/interrupt.h>
6887 -#include <linux/irq.h>
6888 -#include <linux/sched.h>
6889 -#include <linux/wait.h>
6890 +#include <linux/mfd/syscon.h>
6891 +#include <linux/regmap.h>
6893  #include "pmc.h"
6894  #include "sckc.h"
6895 @@ -59,7 +53,7 @@ struct clk_slow_rc_osc {
6897  struct clk_sam9260_slow {
6898         struct clk_hw hw;
6899 -       struct at91_pmc *pmc;
6900 +       struct regmap *regmap;
6901  };
6903  #define to_clk_sam9260_slow(hw) container_of(hw, struct clk_sam9260_slow, hw)
6904 @@ -393,8 +387,11 @@ void __init of_at91sam9x5_clk_slow_setup(struct device_node *np,
6905  static u8 clk_sam9260_slow_get_parent(struct clk_hw *hw)
6907         struct clk_sam9260_slow *slowck = to_clk_sam9260_slow(hw);
6908 +       unsigned int status;
6910 -       return !!(pmc_read(slowck->pmc, AT91_PMC_SR) & AT91_PMC_OSCSEL);
6911 +       regmap_read(slowck->regmap, AT91_PMC_SR, &status);
6913 +       return status & AT91_PMC_OSCSEL ? 1 : 0;
6916  static const struct clk_ops sam9260_slow_ops = {
6917 @@ -402,7 +399,7 @@ static const struct clk_ops sam9260_slow_ops = {
6918  };
6920  static struct clk * __init
6921 -at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6922 +at91_clk_register_sam9260_slow(struct regmap *regmap,
6923                                const char *name,
6924                                const char **parent_names,
6925                                int num_parents)
6926 @@ -411,7 +408,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6927         struct clk *clk = NULL;
6928         struct clk_init_data init;
6930 -       if (!pmc || !name)
6931 +       if (!name)
6932                 return ERR_PTR(-EINVAL);
6934         if (!parent_names || !num_parents)
6935 @@ -428,7 +425,7 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6936         init.flags = 0;
6938         slowck->hw.init = &init;
6939 -       slowck->pmc = pmc;
6940 +       slowck->regmap = regmap;
6942         clk = clk_register(NULL, &slowck->hw);
6943         if (IS_ERR(clk))
6944 @@ -439,29 +436,34 @@ at91_clk_register_sam9260_slow(struct at91_pmc *pmc,
6945         return clk;
6948 -void __init of_at91sam9260_clk_slow_setup(struct device_node *np,
6949 -                                         struct at91_pmc *pmc)
6950 +static void __init of_at91sam9260_clk_slow_setup(struct device_node *np)
6952         struct clk *clk;
6953         const char *parent_names[2];
6954         int num_parents;
6955         const char *name = np->name;
6956 +       struct regmap *regmap;
6958         num_parents = of_clk_get_parent_count(np);
6959         if (num_parents != 2)
6960                 return;
6962         of_clk_parent_fill(np, parent_names, num_parents);
6963 +       regmap = syscon_node_to_regmap(of_get_parent(np));
6964 +       if (IS_ERR(regmap))
6965 +               return;
6967         of_property_read_string(np, "clock-output-names", &name);
6969 -       clk = at91_clk_register_sam9260_slow(pmc, name, parent_names,
6970 +       clk = at91_clk_register_sam9260_slow(regmap, name, parent_names,
6971                                              num_parents);
6972         if (IS_ERR(clk))
6973                 return;
6975         of_clk_add_provider(np, of_clk_src_simple_get, clk);
6977 +CLK_OF_DECLARE(at91sam9260_clk_slow, "atmel,at91sam9260-clk-slow",
6978 +              of_at91sam9260_clk_slow_setup);
6980  /*
6981   * FIXME: All slow clk users are not properly claiming it (get + prepare +
6982 diff --git a/drivers/clk/at91/clk-smd.c b/drivers/clk/at91/clk-smd.c
6983 index a7f8501cfa05..e6948a52005a 100644
6984 --- a/drivers/clk/at91/clk-smd.c
6985 +++ b/drivers/clk/at91/clk-smd.c
6986 @@ -12,8 +12,8 @@
6987  #include <linux/clkdev.h>
6988  #include <linux/clk/at91_pmc.h>
6989  #include <linux/of.h>
6990 -#include <linux/of_address.h>
6991 -#include <linux/io.h>
6992 +#include <linux/mfd/syscon.h>
6993 +#include <linux/regmap.h>
6995  #include "pmc.h"
6997 @@ -24,7 +24,7 @@
6999  struct at91sam9x5_clk_smd {
7000         struct clk_hw hw;
7001 -       struct at91_pmc *pmc;
7002 +       struct regmap *regmap;
7003  };
7005  #define to_at91sam9x5_clk_smd(hw) \
7006 @@ -33,13 +33,13 @@ struct at91sam9x5_clk_smd {
7007  static unsigned long at91sam9x5_clk_smd_recalc_rate(struct clk_hw *hw,
7008                                                     unsigned long parent_rate)
7010 -       u32 tmp;
7011 -       u8 smddiv;
7012         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7013 -       struct at91_pmc *pmc = smd->pmc;
7014 +       unsigned int smdr;
7015 +       u8 smddiv;
7017 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7018 +       smddiv = (smdr & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
7020 -       tmp = pmc_read(pmc, AT91_PMC_SMD);
7021 -       smddiv = (tmp & AT91_PMC_SMD_DIV) >> SMD_DIV_SHIFT;
7022         return parent_rate / (smddiv + 1);
7025 @@ -67,40 +67,38 @@ static long at91sam9x5_clk_smd_round_rate(struct clk_hw *hw, unsigned long rate,
7027  static int at91sam9x5_clk_smd_set_parent(struct clk_hw *hw, u8 index)
7029 -       u32 tmp;
7030         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7031 -       struct at91_pmc *pmc = smd->pmc;
7033         if (index > 1)
7034                 return -EINVAL;
7035 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMDS;
7036 -       if (index)
7037 -               tmp |= AT91_PMC_SMDS;
7038 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7040 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMDS,
7041 +                          index ? AT91_PMC_SMDS : 0);
7043         return 0;
7046  static u8 at91sam9x5_clk_smd_get_parent(struct clk_hw *hw)
7048         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7049 -       struct at91_pmc *pmc = smd->pmc;
7050 +       unsigned int smdr;
7052 -       return pmc_read(pmc, AT91_PMC_SMD) & AT91_PMC_SMDS;
7053 +       regmap_read(smd->regmap, AT91_PMC_SMD, &smdr);
7055 +       return smdr & AT91_PMC_SMDS;
7058  static int at91sam9x5_clk_smd_set_rate(struct clk_hw *hw, unsigned long rate,
7059                                        unsigned long parent_rate)
7061 -       u32 tmp;
7062         struct at91sam9x5_clk_smd *smd = to_at91sam9x5_clk_smd(hw);
7063 -       struct at91_pmc *pmc = smd->pmc;
7064         unsigned long div = parent_rate / rate;
7066         if (parent_rate % rate || div < 1 || div > (SMD_MAX_DIV + 1))
7067                 return -EINVAL;
7068 -       tmp = pmc_read(pmc, AT91_PMC_SMD) & ~AT91_PMC_SMD_DIV;
7069 -       tmp |= (div - 1) << SMD_DIV_SHIFT;
7070 -       pmc_write(pmc, AT91_PMC_SMD, tmp);
7072 +       regmap_update_bits(smd->regmap, AT91_PMC_SMD, AT91_PMC_SMD_DIV,
7073 +                          (div - 1) << SMD_DIV_SHIFT);
7075         return 0;
7077 @@ -114,7 +112,7 @@ static const struct clk_ops at91sam9x5_smd_ops = {
7078  };
7080  static struct clk * __init
7081 -at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7082 +at91sam9x5_clk_register_smd(struct regmap *regmap, const char *name,
7083                             const char **parent_names, u8 num_parents)
7085         struct at91sam9x5_clk_smd *smd;
7086 @@ -132,7 +130,7 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7087         init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE;
7089         smd->hw.init = &init;
7090 -       smd->pmc = pmc;
7091 +       smd->regmap = regmap;
7093         clk = clk_register(NULL, &smd->hw);
7094         if (IS_ERR(clk))
7095 @@ -141,13 +139,13 @@ at91sam9x5_clk_register_smd(struct at91_pmc *pmc, const char *name,
7096         return clk;
7099 -void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7100 -                                       struct at91_pmc *pmc)
7101 +static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np)
7103         struct clk *clk;
7104         int num_parents;
7105         const char *parent_names[SMD_SOURCE_MAX];
7106         const char *name = np->name;
7107 +       struct regmap *regmap;
7109         num_parents = of_clk_get_parent_count(np);
7110         if (num_parents <= 0 || num_parents > SMD_SOURCE_MAX)
7111 @@ -157,10 +155,16 @@ void __init of_at91sam9x5_clk_smd_setup(struct device_node *np,
7113         of_property_read_string(np, "clock-output-names", &name);
7115 -       clk = at91sam9x5_clk_register_smd(pmc, name, parent_names,
7116 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7117 +       if (IS_ERR(regmap))
7118 +               return;
7120 +       clk = at91sam9x5_clk_register_smd(regmap, name, parent_names,
7121                                           num_parents);
7122         if (IS_ERR(clk))
7123                 return;
7125         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7127 +CLK_OF_DECLARE(at91sam9x5_clk_smd, "atmel,at91sam9x5-clk-smd",
7128 +              of_at91sam9x5_clk_smd_setup);
7129 diff --git a/drivers/clk/at91/clk-system.c b/drivers/clk/at91/clk-system.c
7130 index 3f5314344286..8f35d8172909 100644
7131 --- a/drivers/clk/at91/clk-system.c
7132 +++ b/drivers/clk/at91/clk-system.c
7133 @@ -12,13 +12,8 @@
7134  #include <linux/clkdev.h>
7135  #include <linux/clk/at91_pmc.h>
7136  #include <linux/of.h>
7137 -#include <linux/of_address.h>
7138 -#include <linux/io.h>
7139 -#include <linux/irq.h>
7140 -#include <linux/of_irq.h>
7141 -#include <linux/interrupt.h>
7142 -#include <linux/wait.h>
7143 -#include <linux/sched.h>
7144 +#include <linux/mfd/syscon.h>
7145 +#include <linux/regmap.h>
7147  #include "pmc.h"
7149 @@ -29,9 +24,7 @@
7150  #define to_clk_system(hw) container_of(hw, struct clk_system, hw)
7151  struct clk_system {
7152         struct clk_hw hw;
7153 -       struct at91_pmc *pmc;
7154 -       unsigned int irq;
7155 -       wait_queue_head_t wait;
7156 +       struct regmap *regmap;
7157         u8 id;
7158  };
7160 @@ -39,58 +32,54 @@ static inline int is_pck(int id)
7162         return (id >= 8) && (id <= 15);
7164 -static irqreturn_t clk_system_irq_handler(int irq, void *dev_id)
7166 +static inline bool clk_system_ready(struct regmap *regmap, int id)
7168 -       struct clk_system *sys = (struct clk_system *)dev_id;
7169 +       unsigned int status;
7171 -       wake_up(&sys->wait);
7172 -       disable_irq_nosync(sys->irq);
7173 +       regmap_read(regmap, AT91_PMC_SR, &status);
7175 -       return IRQ_HANDLED;
7176 +       return status & (1 << id) ? 1 : 0;
7179  static int clk_system_prepare(struct clk_hw *hw)
7181         struct clk_system *sys = to_clk_system(hw);
7182 -       struct at91_pmc *pmc = sys->pmc;
7183 -       u32 mask = 1 << sys->id;
7185 -       pmc_write(pmc, AT91_PMC_SCER, mask);
7186 +       regmap_write(sys->regmap, AT91_PMC_SCER, 1 << sys->id);
7188         if (!is_pck(sys->id))
7189                 return 0;
7191 -       while (!(pmc_read(pmc, AT91_PMC_SR) & mask)) {
7192 -               if (sys->irq) {
7193 -                       enable_irq(sys->irq);
7194 -                       wait_event(sys->wait,
7195 -                                  pmc_read(pmc, AT91_PMC_SR) & mask);
7196 -               } else
7197 -                       cpu_relax();
7198 -       }
7199 +       while (!clk_system_ready(sys->regmap, sys->id))
7200 +               cpu_relax();
7202         return 0;
7205  static void clk_system_unprepare(struct clk_hw *hw)
7207         struct clk_system *sys = to_clk_system(hw);
7208 -       struct at91_pmc *pmc = sys->pmc;
7210 -       pmc_write(pmc, AT91_PMC_SCDR, 1 << sys->id);
7211 +       regmap_write(sys->regmap, AT91_PMC_SCDR, 1 << sys->id);
7214  static int clk_system_is_prepared(struct clk_hw *hw)
7216         struct clk_system *sys = to_clk_system(hw);
7217 -       struct at91_pmc *pmc = sys->pmc;
7218 +       unsigned int status;
7220 +       regmap_read(sys->regmap, AT91_PMC_SCSR, &status);
7222 -       if (!(pmc_read(pmc, AT91_PMC_SCSR) & (1 << sys->id)))
7223 +       if (!(status & (1 << sys->id)))
7224                 return 0;
7226         if (!is_pck(sys->id))
7227                 return 1;
7229 -       return !!(pmc_read(pmc, AT91_PMC_SR) & (1 << sys->id));
7230 +       regmap_read(sys->regmap, AT91_PMC_SR, &status);
7232 +       return status & (1 << sys->id) ? 1 : 0;
7235  static const struct clk_ops system_ops = {
7236 @@ -100,13 +89,12 @@ static const struct clk_ops system_ops = {
7237  };
7239  static struct clk * __init
7240 -at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7241 -                        const char *parent_name, u8 id, int irq)
7242 +at91_clk_register_system(struct regmap *regmap, const char *name,
7243 +                        const char *parent_name, u8 id)
7245         struct clk_system *sys;
7246         struct clk *clk = NULL;
7247         struct clk_init_data init;
7248 -       int ret;
7250         if (!parent_name || id > SYSTEM_MAX_ID)
7251                 return ERR_PTR(-EINVAL);
7252 @@ -123,44 +111,33 @@ at91_clk_register_system(struct at91_pmc *pmc, const char *name,
7254         sys->id = id;
7255         sys->hw.init = &init;
7256 -       sys->pmc = pmc;
7257 -       sys->irq = irq;
7258 -       if (irq) {
7259 -               init_waitqueue_head(&sys->wait);
7260 -               irq_set_status_flags(sys->irq, IRQ_NOAUTOEN);
7261 -               ret = request_irq(sys->irq, clk_system_irq_handler,
7262 -                               IRQF_TRIGGER_HIGH, name, sys);
7263 -               if (ret) {
7264 -                       kfree(sys);
7265 -                       return ERR_PTR(ret);
7266 -               }
7267 -       }
7268 +       sys->regmap = regmap;
7270         clk = clk_register(NULL, &sys->hw);
7271 -       if (IS_ERR(clk)) {
7272 -               if (irq)
7273 -                       free_irq(sys->irq, sys);
7274 +       if (IS_ERR(clk))
7275                 kfree(sys);
7276 -       }
7278         return clk;
7281 -static void __init
7282 -of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7283 +static void __init of_at91rm9200_clk_sys_setup(struct device_node *np)
7285         int num;
7286 -       int irq = 0;
7287         u32 id;
7288         struct clk *clk;
7289         const char *name;
7290         struct device_node *sysclknp;
7291         const char *parent_name;
7292 +       struct regmap *regmap;
7294         num = of_get_child_count(np);
7295         if (num > (SYSTEM_MAX_ID + 1))
7296                 return;
7298 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7299 +       if (IS_ERR(regmap))
7300 +               return;
7302         for_each_child_of_node(np, sysclknp) {
7303                 if (of_property_read_u32(sysclknp, "reg", &id))
7304                         continue;
7305 @@ -168,21 +145,14 @@ of_at91_clk_sys_setup(struct device_node *np, struct at91_pmc *pmc)
7306                 if (of_property_read_string(np, "clock-output-names", &name))
7307                         name = sysclknp->name;
7309 -               if (is_pck(id))
7310 -                       irq = irq_of_parse_and_map(sysclknp, 0);
7312                 parent_name = of_clk_get_parent_name(sysclknp, 0);
7314 -               clk = at91_clk_register_system(pmc, name, parent_name, id, irq);
7315 +               clk = at91_clk_register_system(regmap, name, parent_name, id);
7316                 if (IS_ERR(clk))
7317                         continue;
7319                 of_clk_add_provider(sysclknp, of_clk_src_simple_get, clk);
7320         }
7323 -void __init of_at91rm9200_clk_sys_setup(struct device_node *np,
7324 -                                       struct at91_pmc *pmc)
7326 -       of_at91_clk_sys_setup(np, pmc);
7328 +CLK_OF_DECLARE(at91rm9200_clk_sys, "atmel,at91rm9200-clk-system",
7329 +              of_at91rm9200_clk_sys_setup);
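
The clk-system.c hunks above replace the PMC interrupt plus waitqueue handshake with a plain poll of the status register through the shared regmap; clk_system_prepare() now spins with cpu_relax() until the ready bit appears. A stripped-down sketch of that polling helper, assuming an arbitrary example bit (bit 8 is illustrative only, the real code uses the clock's id):

/* sketch: poll a PMC status bit via the syscon regmap, no IRQ involved */
#include <linux/bitops.h>
#include <linux/clk/at91_pmc.h>
#include <linux/regmap.h>
#include <asm/processor.h>	/* cpu_relax() */

#define EXAMPLE_READY_BIT	8	/* illustrative bit number */

static bool example_clk_ready(struct regmap *regmap)
{
	unsigned int status;

	/* one locked read of AT91_PMC_SR through the regmap */
	regmap_read(regmap, AT91_PMC_SR, &status);

	return status & BIT(EXAMPLE_READY_BIT);
}

static void example_wait_ready(struct regmap *regmap)
{
	/* busy-wait, as the converted prepare() callbacks above do;
	 * the PMC interrupt handler and waitqueue have been removed */
	while (!example_clk_ready(regmap))
		cpu_relax();
}
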
7330 diff --git a/drivers/clk/at91/clk-usb.c b/drivers/clk/at91/clk-usb.c
7331 index 8ab8502778a2..650ca45892c0 100644
7332 --- a/drivers/clk/at91/clk-usb.c
7333 +++ b/drivers/clk/at91/clk-usb.c
7334 @@ -12,8 +12,8 @@
7335  #include <linux/clkdev.h>
7336  #include <linux/clk/at91_pmc.h>
7337  #include <linux/of.h>
7338 -#include <linux/of_address.h>
7339 -#include <linux/io.h>
7340 +#include <linux/mfd/syscon.h>
7341 +#include <linux/regmap.h>
7343  #include "pmc.h"
7345 @@ -27,7 +27,7 @@
7347  struct at91sam9x5_clk_usb {
7348         struct clk_hw hw;
7349 -       struct at91_pmc *pmc;
7350 +       struct regmap *regmap;
7351  };
7353  #define to_at91sam9x5_clk_usb(hw) \
7354 @@ -35,7 +35,7 @@ struct at91sam9x5_clk_usb {
7356  struct at91rm9200_clk_usb {
7357         struct clk_hw hw;
7358 -       struct at91_pmc *pmc;
7359 +       struct regmap *regmap;
7360         u32 divisors[4];
7361  };
7363 @@ -45,13 +45,12 @@ struct at91rm9200_clk_usb {
7364  static unsigned long at91sam9x5_clk_usb_recalc_rate(struct clk_hw *hw,
7365                                                     unsigned long parent_rate)
7367 -       u32 tmp;
7368 -       u8 usbdiv;
7369         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7370 -       struct at91_pmc *pmc = usb->pmc;
7371 +       unsigned int usbr;
7372 +       u8 usbdiv;
7374 -       tmp = pmc_read(pmc, AT91_PMC_USB);
7375 -       usbdiv = (tmp & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7376 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7377 +       usbdiv = (usbr & AT91_PMC_OHCIUSBDIV) >> SAM9X5_USB_DIV_SHIFT;
7379         return DIV_ROUND_CLOSEST(parent_rate, (usbdiv + 1));
7381 @@ -109,33 +108,31 @@ static int at91sam9x5_clk_usb_determine_rate(struct clk_hw *hw,
7383  static int at91sam9x5_clk_usb_set_parent(struct clk_hw *hw, u8 index)
7385 -       u32 tmp;
7386         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7387 -       struct at91_pmc *pmc = usb->pmc;
7389         if (index > 1)
7390                 return -EINVAL;
7391 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS;
7392 -       if (index)
7393 -               tmp |= AT91_PMC_USBS;
7394 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7396 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7397 +                          index ? AT91_PMC_USBS : 0);
7399         return 0;
7402  static u8 at91sam9x5_clk_usb_get_parent(struct clk_hw *hw)
7404         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7405 -       struct at91_pmc *pmc = usb->pmc;
7406 +       unsigned int usbr;
7408 -       return pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS;
7409 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7411 +       return usbr & AT91_PMC_USBS;
7414  static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7415                                        unsigned long parent_rate)
7417 -       u32 tmp;
7418         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7419 -       struct at91_pmc *pmc = usb->pmc;
7420         unsigned long div;
7422         if (!rate)
7423 @@ -145,9 +142,8 @@ static int at91sam9x5_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7424         if (div > SAM9X5_USB_MAX_DIV + 1 || !div)
7425                 return -EINVAL;
7427 -       tmp = pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_OHCIUSBDIV;
7428 -       tmp |= (div - 1) << SAM9X5_USB_DIV_SHIFT;
7429 -       pmc_write(pmc, AT91_PMC_USB, tmp);
7430 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_OHCIUSBDIV,
7431 +                          (div - 1) << SAM9X5_USB_DIV_SHIFT);
7433         return 0;
7435 @@ -163,28 +159,28 @@ static const struct clk_ops at91sam9x5_usb_ops = {
7436  static int at91sam9n12_clk_usb_enable(struct clk_hw *hw)
7438         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7439 -       struct at91_pmc *pmc = usb->pmc;
7441 -       pmc_write(pmc, AT91_PMC_USB,
7442 -                 pmc_read(pmc, AT91_PMC_USB) | AT91_PMC_USBS);
7443 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS,
7444 +                          AT91_PMC_USBS);
7446         return 0;
7449  static void at91sam9n12_clk_usb_disable(struct clk_hw *hw)
7451         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7452 -       struct at91_pmc *pmc = usb->pmc;
7454 -       pmc_write(pmc, AT91_PMC_USB,
7455 -                 pmc_read(pmc, AT91_PMC_USB) & ~AT91_PMC_USBS);
7456 +       regmap_update_bits(usb->regmap, AT91_PMC_USB, AT91_PMC_USBS, 0);
7459  static int at91sam9n12_clk_usb_is_enabled(struct clk_hw *hw)
7461         struct at91sam9x5_clk_usb *usb = to_at91sam9x5_clk_usb(hw);
7462 -       struct at91_pmc *pmc = usb->pmc;
7463 +       unsigned int usbr;
7465 -       return !!(pmc_read(pmc, AT91_PMC_USB) & AT91_PMC_USBS);
7466 +       regmap_read(usb->regmap, AT91_PMC_USB, &usbr);
7468 +       return usbr & AT91_PMC_USBS;
7471  static const struct clk_ops at91sam9n12_usb_ops = {
7472 @@ -197,7 +193,7 @@ static const struct clk_ops at91sam9n12_usb_ops = {
7473  };
7475  static struct clk * __init
7476 -at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7477 +at91sam9x5_clk_register_usb(struct regmap *regmap, const char *name,
7478                             const char **parent_names, u8 num_parents)
7480         struct at91sam9x5_clk_usb *usb;
7481 @@ -216,7 +212,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7482                      CLK_SET_RATE_PARENT;
7484         usb->hw.init = &init;
7485 -       usb->pmc = pmc;
7486 +       usb->regmap = regmap;
7488         clk = clk_register(NULL, &usb->hw);
7489         if (IS_ERR(clk))
7490 @@ -226,7 +222,7 @@ at91sam9x5_clk_register_usb(struct at91_pmc *pmc, const char *name,
7493  static struct clk * __init
7494 -at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7495 +at91sam9n12_clk_register_usb(struct regmap *regmap, const char *name,
7496                              const char *parent_name)
7498         struct at91sam9x5_clk_usb *usb;
7499 @@ -244,7 +240,7 @@ at91sam9n12_clk_register_usb(struct at91_pmc *pmc, const char *name,
7500         init.flags = CLK_SET_RATE_GATE | CLK_SET_RATE_PARENT;
7502         usb->hw.init = &init;
7503 -       usb->pmc = pmc;
7504 +       usb->regmap = regmap;
7506         clk = clk_register(NULL, &usb->hw);
7507         if (IS_ERR(clk))
7508 @@ -257,12 +253,12 @@ static unsigned long at91rm9200_clk_usb_recalc_rate(struct clk_hw *hw,
7509                                                     unsigned long parent_rate)
7511         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7512 -       struct at91_pmc *pmc = usb->pmc;
7513 -       u32 tmp;
7514 +       unsigned int pllbr;
7515         u8 usbdiv;
7517 -       tmp = pmc_read(pmc, AT91_CKGR_PLLBR);
7518 -       usbdiv = (tmp & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7519 +       regmap_read(usb->regmap, AT91_CKGR_PLLBR, &pllbr);
7521 +       usbdiv = (pllbr & AT91_PMC_USBDIV) >> RM9200_USB_DIV_SHIFT;
7522         if (usb->divisors[usbdiv])
7523                 return parent_rate / usb->divisors[usbdiv];
7525 @@ -310,10 +306,8 @@ static long at91rm9200_clk_usb_round_rate(struct clk_hw *hw, unsigned long rate,
7526  static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7527                                        unsigned long parent_rate)
7529 -       u32 tmp;
7530         int i;
7531         struct at91rm9200_clk_usb *usb = to_at91rm9200_clk_usb(hw);
7532 -       struct at91_pmc *pmc = usb->pmc;
7533         unsigned long div;
7535         if (!rate)
7536 @@ -323,10 +317,10 @@ static int at91rm9200_clk_usb_set_rate(struct clk_hw *hw, unsigned long rate,
7538         for (i = 0; i < RM9200_USB_DIV_TAB_SIZE; i++) {
7539                 if (usb->divisors[i] == div) {
7540 -                       tmp = pmc_read(pmc, AT91_CKGR_PLLBR) &
7541 -                             ~AT91_PMC_USBDIV;
7542 -                       tmp |= i << RM9200_USB_DIV_SHIFT;
7543 -                       pmc_write(pmc, AT91_CKGR_PLLBR, tmp);
7544 +                       regmap_update_bits(usb->regmap, AT91_CKGR_PLLBR,
7545 +                                          AT91_PMC_USBDIV,
7546 +                                          i << RM9200_USB_DIV_SHIFT);
7548                         return 0;
7549                 }
7550         }
7551 @@ -341,7 +335,7 @@ static const struct clk_ops at91rm9200_usb_ops = {
7552  };
7554  static struct clk * __init
7555 -at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7556 +at91rm9200_clk_register_usb(struct regmap *regmap, const char *name,
7557                             const char *parent_name, const u32 *divisors)
7559         struct at91rm9200_clk_usb *usb;
7560 @@ -359,7 +353,7 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7561         init.flags = CLK_SET_RATE_PARENT;
7563         usb->hw.init = &init;
7564 -       usb->pmc = pmc;
7565 +       usb->regmap = regmap;
7566         memcpy(usb->divisors, divisors, sizeof(usb->divisors));
7568         clk = clk_register(NULL, &usb->hw);
7569 @@ -369,13 +363,13 @@ at91rm9200_clk_register_usb(struct at91_pmc *pmc, const char *name,
7570         return clk;
7573 -void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7574 -                                       struct at91_pmc *pmc)
7575 +static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np)
7577         struct clk *clk;
7578         int num_parents;
7579         const char *parent_names[USB_SOURCE_MAX];
7580         const char *name = np->name;
7581 +       struct regmap *regmap;
7583         num_parents = of_clk_get_parent_count(np);
7584         if (num_parents <= 0 || num_parents > USB_SOURCE_MAX)
7585 @@ -385,19 +379,26 @@ void __init of_at91sam9x5_clk_usb_setup(struct device_node *np,
7587         of_property_read_string(np, "clock-output-names", &name);
7589 -       clk = at91sam9x5_clk_register_usb(pmc, name, parent_names, num_parents);
7590 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7591 +       if (IS_ERR(regmap))
7592 +               return;
7594 +       clk = at91sam9x5_clk_register_usb(regmap, name, parent_names,
7595 +                                         num_parents);
7596         if (IS_ERR(clk))
7597                 return;
7599         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7601 +CLK_OF_DECLARE(at91sam9x5_clk_usb, "atmel,at91sam9x5-clk-usb",
7602 +              of_at91sam9x5_clk_usb_setup);
7604 -void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7605 -                                        struct at91_pmc *pmc)
7606 +static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np)
7608         struct clk *clk;
7609         const char *parent_name;
7610         const char *name = np->name;
7611 +       struct regmap *regmap;
7613         parent_name = of_clk_get_parent_name(np, 0);
7614         if (!parent_name)
7615 @@ -405,20 +406,26 @@ void __init of_at91sam9n12_clk_usb_setup(struct device_node *np,
7617         of_property_read_string(np, "clock-output-names", &name);
7619 -       clk = at91sam9n12_clk_register_usb(pmc, name, parent_name);
7620 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7621 +       if (IS_ERR(regmap))
7622 +               return;
7624 +       clk = at91sam9n12_clk_register_usb(regmap, name, parent_name);
7625         if (IS_ERR(clk))
7626                 return;
7628         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7630 +CLK_OF_DECLARE(at91sam9n12_clk_usb, "atmel,at91sam9n12-clk-usb",
7631 +              of_at91sam9n12_clk_usb_setup);
7633 -void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7634 -                                       struct at91_pmc *pmc)
7635 +static void __init of_at91rm9200_clk_usb_setup(struct device_node *np)
7637         struct clk *clk;
7638         const char *parent_name;
7639         const char *name = np->name;
7640         u32 divisors[4] = {0, 0, 0, 0};
7641 +       struct regmap *regmap;
7643         parent_name = of_clk_get_parent_name(np, 0);
7644         if (!parent_name)
7645 @@ -430,9 +437,15 @@ void __init of_at91rm9200_clk_usb_setup(struct device_node *np,
7647         of_property_read_string(np, "clock-output-names", &name);
7649 -       clk = at91rm9200_clk_register_usb(pmc, name, parent_name, divisors);
7650 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7651 +       if (IS_ERR(regmap))
7652 +               return;
7654 +       clk = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors);
7655         if (IS_ERR(clk))
7656                 return;
7658         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7660 +CLK_OF_DECLARE(at91rm9200_clk_usb, "atmel,at91rm9200-clk-usb",
7661 +              of_at91rm9200_clk_usb_setup);
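
Throughout the clk-usb.c conversion above, every pmc_read()/modify/pmc_write() sequence collapses into a single regmap_update_bits() call, which performs the locked read-modify-write inside the regmap core. A one-function sketch of the equivalence, with hypothetical register and field parameters:

/* sketch: field update before and after the regmap conversion */
#include <linux/regmap.h>

/* was (under the driver's own lock):
 *	tmp = pmc_read(pmc, reg) & ~mask;
 *	tmp |= val;
 *	pmc_write(pmc, reg, tmp);
 */
static int example_update_field(struct regmap *regmap, unsigned int reg,
				unsigned int mask, unsigned int val)
{
	/* regmap serializes and performs the read-modify-write itself */
	return regmap_update_bits(regmap, reg, mask, val);
}
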
7662 diff --git a/drivers/clk/at91/clk-utmi.c b/drivers/clk/at91/clk-utmi.c
7663 index ca561e90a60f..61fcf399e58c 100644
7664 --- a/drivers/clk/at91/clk-utmi.c
7665 +++ b/drivers/clk/at91/clk-utmi.c
7666 @@ -11,14 +11,9 @@
7667  #include <linux/clk-provider.h>
7668  #include <linux/clkdev.h>
7669  #include <linux/clk/at91_pmc.h>
7670 -#include <linux/interrupt.h>
7671 -#include <linux/irq.h>
7672  #include <linux/of.h>
7673 -#include <linux/of_address.h>
7674 -#include <linux/of_irq.h>
7675 -#include <linux/io.h>
7676 -#include <linux/sched.h>
7677 -#include <linux/wait.h>
7678 +#include <linux/mfd/syscon.h>
7679 +#include <linux/regmap.h>
7681  #include "pmc.h"
7683 @@ -26,37 +21,30 @@
7685  struct clk_utmi {
7686         struct clk_hw hw;
7687 -       struct at91_pmc *pmc;
7688 -       unsigned int irq;
7689 -       wait_queue_head_t wait;
7690 +       struct regmap *regmap;
7691  };
7693  #define to_clk_utmi(hw) container_of(hw, struct clk_utmi, hw)
7695 -static irqreturn_t clk_utmi_irq_handler(int irq, void *dev_id)
7696 +static inline bool clk_utmi_ready(struct regmap *regmap)
7698 -       struct clk_utmi *utmi = (struct clk_utmi *)dev_id;
7699 +       unsigned int status;
7701 -       wake_up(&utmi->wait);
7702 -       disable_irq_nosync(utmi->irq);
7703 +       regmap_read(regmap, AT91_PMC_SR, &status);
7705 -       return IRQ_HANDLED;
7706 +       return status & AT91_PMC_LOCKU;
7709  static int clk_utmi_prepare(struct clk_hw *hw)
7711         struct clk_utmi *utmi = to_clk_utmi(hw);
7712 -       struct at91_pmc *pmc = utmi->pmc;
7713 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) | AT91_PMC_UPLLEN |
7714 -                 AT91_PMC_UPLLCOUNT | AT91_PMC_BIASEN;
7715 +       unsigned int uckr = AT91_PMC_UPLLEN | AT91_PMC_UPLLCOUNT |
7716 +                           AT91_PMC_BIASEN;
7718 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7719 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, uckr, uckr);
7721 -       while (!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU)) {
7722 -               enable_irq(utmi->irq);
7723 -               wait_event(utmi->wait,
7724 -                          pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7725 -       }
7726 +       while (!clk_utmi_ready(utmi->regmap))
7727 +               cpu_relax();
7729         return 0;
7731 @@ -64,18 +52,15 @@ static int clk_utmi_prepare(struct clk_hw *hw)
7732  static int clk_utmi_is_prepared(struct clk_hw *hw)
7734         struct clk_utmi *utmi = to_clk_utmi(hw);
7735 -       struct at91_pmc *pmc = utmi->pmc;
7737 -       return !!(pmc_read(pmc, AT91_PMC_SR) & AT91_PMC_LOCKU);
7738 +       return clk_utmi_ready(utmi->regmap);
7741  static void clk_utmi_unprepare(struct clk_hw *hw)
7743         struct clk_utmi *utmi = to_clk_utmi(hw);
7744 -       struct at91_pmc *pmc = utmi->pmc;
7745 -       u32 tmp = pmc_read(pmc, AT91_CKGR_UCKR) & ~AT91_PMC_UPLLEN;
7747 -       pmc_write(pmc, AT91_CKGR_UCKR, tmp);
7748 +       regmap_update_bits(utmi->regmap, AT91_CKGR_UCKR, AT91_PMC_UPLLEN, 0);
7751  static unsigned long clk_utmi_recalc_rate(struct clk_hw *hw,
7752 @@ -93,10 +78,9 @@ static const struct clk_ops utmi_ops = {
7753  };
7755  static struct clk * __init
7756 -at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7757 +at91_clk_register_utmi(struct regmap *regmap,
7758                        const char *name, const char *parent_name)
7760 -       int ret;
7761         struct clk_utmi *utmi;
7762         struct clk *clk = NULL;
7763         struct clk_init_data init;
7764 @@ -112,52 +96,36 @@ at91_clk_register_utmi(struct at91_pmc *pmc, unsigned int irq,
7765         init.flags = CLK_SET_RATE_GATE;
7767         utmi->hw.init = &init;
7768 -       utmi->pmc = pmc;
7769 -       utmi->irq = irq;
7770 -       init_waitqueue_head(&utmi->wait);
7771 -       irq_set_status_flags(utmi->irq, IRQ_NOAUTOEN);
7772 -       ret = request_irq(utmi->irq, clk_utmi_irq_handler,
7773 -                         IRQF_TRIGGER_HIGH, "clk-utmi", utmi);
7774 -       if (ret) {
7775 -               kfree(utmi);
7776 -               return ERR_PTR(ret);
7777 -       }
7778 +       utmi->regmap = regmap;
7780         clk = clk_register(NULL, &utmi->hw);
7781 -       if (IS_ERR(clk)) {
7782 -               free_irq(utmi->irq, utmi);
7783 +       if (IS_ERR(clk))
7784                 kfree(utmi);
7785 -       }
7787         return clk;
7790 -static void __init
7791 -of_at91_clk_utmi_setup(struct device_node *np, struct at91_pmc *pmc)
7792 +static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np)
7794 -       unsigned int irq;
7795         struct clk *clk;
7796         const char *parent_name;
7797         const char *name = np->name;
7798 +       struct regmap *regmap;
7800         parent_name = of_clk_get_parent_name(np, 0);
7802         of_property_read_string(np, "clock-output-names", &name);
7804 -       irq = irq_of_parse_and_map(np, 0);
7805 -       if (!irq)
7806 +       regmap = syscon_node_to_regmap(of_get_parent(np));
7807 +       if (IS_ERR(regmap))
7808                 return;
7810 -       clk = at91_clk_register_utmi(pmc, irq, name, parent_name);
7811 +       clk = at91_clk_register_utmi(regmap, name, parent_name);
7812         if (IS_ERR(clk))
7813                 return;
7815         of_clk_add_provider(np, of_clk_src_simple_get, clk);
7816         return;
7819 -void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np,
7820 -                                        struct at91_pmc *pmc)
7822 -       of_at91_clk_utmi_setup(np, pmc);
7824 +CLK_OF_DECLARE(at91sam9x5_clk_utmi, "atmel,at91sam9x5-clk-utmi",
7825 +              of_at91sam9x5_clk_utmi_setup);
7826 diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c
7827 index 8476b570779b..526df5ba042d 100644
7828 --- a/drivers/clk/at91/pmc.c
7829 +++ b/drivers/clk/at91/pmc.c
7830 @@ -12,36 +12,13 @@
7831  #include <linux/clkdev.h>
7832  #include <linux/clk/at91_pmc.h>
7833  #include <linux/of.h>
7834 -#include <linux/of_address.h>
7835 -#include <linux/io.h>
7836 -#include <linux/interrupt.h>
7837 -#include <linux/irq.h>
7838 -#include <linux/irqchip/chained_irq.h>
7839 -#include <linux/irqdomain.h>
7840 -#include <linux/of_irq.h>
7841 +#include <linux/mfd/syscon.h>
7842 +#include <linux/regmap.h>
7844  #include <asm/proc-fns.h>
7846  #include "pmc.h"
7848 -void __iomem *at91_pmc_base;
7849 -EXPORT_SYMBOL_GPL(at91_pmc_base);
7851 -void at91rm9200_idle(void)
7853 -       /*
7854 -        * Disable the processor clock.  The processor will be automatically
7855 -        * re-enabled by an interrupt or by a reset.
7856 -        */
7857 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7860 -void at91sam9_idle(void)
7862 -       at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK);
7863 -       cpu_do_idle();
7866  int of_at91_get_clk_range(struct device_node *np, const char *propname,
7867                           struct clk_range *range)
7869 @@ -64,402 +41,3 @@ int of_at91_get_clk_range(struct device_node *np, const char *propname,
7870         return 0;
7872  EXPORT_SYMBOL_GPL(of_at91_get_clk_range);
7874 -static void pmc_irq_mask(struct irq_data *d)
7876 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7878 -       pmc_write(pmc, AT91_PMC_IDR, 1 << d->hwirq);
7881 -static void pmc_irq_unmask(struct irq_data *d)
7883 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7885 -       pmc_write(pmc, AT91_PMC_IER, 1 << d->hwirq);
7888 -static int pmc_irq_set_type(struct irq_data *d, unsigned type)
7890 -       if (type != IRQ_TYPE_LEVEL_HIGH) {
7891 -               pr_warn("PMC: type not supported (support only IRQ_TYPE_LEVEL_HIGH type)\n");
7892 -               return -EINVAL;
7893 -       }
7895 -       return 0;
7898 -static void pmc_irq_suspend(struct irq_data *d)
7900 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7902 -       pmc->imr = pmc_read(pmc, AT91_PMC_IMR);
7903 -       pmc_write(pmc, AT91_PMC_IDR, pmc->imr);
7906 -static void pmc_irq_resume(struct irq_data *d)
7908 -       struct at91_pmc *pmc = irq_data_get_irq_chip_data(d);
7910 -       pmc_write(pmc, AT91_PMC_IER, pmc->imr);
7913 -static struct irq_chip pmc_irq = {
7914 -       .name = "PMC",
7915 -       .irq_disable = pmc_irq_mask,
7916 -       .irq_mask = pmc_irq_mask,
7917 -       .irq_unmask = pmc_irq_unmask,
7918 -       .irq_set_type = pmc_irq_set_type,
7919 -       .irq_suspend = pmc_irq_suspend,
7920 -       .irq_resume = pmc_irq_resume,
7923 -static struct lock_class_key pmc_lock_class;
7925 -static int pmc_irq_map(struct irq_domain *h, unsigned int virq,
7926 -                      irq_hw_number_t hw)
7928 -       struct at91_pmc *pmc = h->host_data;
7930 -       irq_set_lockdep_class(virq, &pmc_lock_class);
7932 -       irq_set_chip_and_handler(virq, &pmc_irq,
7933 -                                handle_level_irq);
7934 -       irq_set_chip_data(virq, pmc);
7936 -       return 0;
7939 -static int pmc_irq_domain_xlate(struct irq_domain *d,
7940 -                               struct device_node *ctrlr,
7941 -                               const u32 *intspec, unsigned int intsize,
7942 -                               irq_hw_number_t *out_hwirq,
7943 -                               unsigned int *out_type)
7945 -       struct at91_pmc *pmc = d->host_data;
7946 -       const struct at91_pmc_caps *caps = pmc->caps;
7948 -       if (WARN_ON(intsize < 1))
7949 -               return -EINVAL;
7951 -       *out_hwirq = intspec[0];
7953 -       if (!(caps->available_irqs & (1 << *out_hwirq)))
7954 -               return -EINVAL;
7956 -       *out_type = IRQ_TYPE_LEVEL_HIGH;
7958 -       return 0;
7961 -static const struct irq_domain_ops pmc_irq_ops = {
7962 -       .map    = pmc_irq_map,
7963 -       .xlate  = pmc_irq_domain_xlate,
7966 -static irqreturn_t pmc_irq_handler(int irq, void *data)
7968 -       struct at91_pmc *pmc = (struct at91_pmc *)data;
7969 -       unsigned long sr;
7970 -       int n;
7972 -       sr = pmc_read(pmc, AT91_PMC_SR) & pmc_read(pmc, AT91_PMC_IMR);
7973 -       if (!sr)
7974 -               return IRQ_NONE;
7976 -       for_each_set_bit(n, &sr, BITS_PER_LONG)
7977 -               generic_handle_irq(irq_find_mapping(pmc->irqdomain, n));
7979 -       return IRQ_HANDLED;
7982 -static const struct at91_pmc_caps at91rm9200_caps = {
7983 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7984 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7985 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
7986 -                         AT91_PMC_PCK3RDY,
7989 -static const struct at91_pmc_caps at91sam9260_caps = {
7990 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
7991 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
7992 -                         AT91_PMC_PCK1RDY,
7995 -static const struct at91_pmc_caps at91sam9g45_caps = {
7996 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
7997 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
7998 -                         AT91_PMC_PCK1RDY,
8001 -static const struct at91_pmc_caps at91sam9n12_caps = {
8002 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_LOCKB |
8003 -                         AT91_PMC_MCKRDY | AT91_PMC_PCK0RDY |
8004 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
8005 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
8008 -static const struct at91_pmc_caps at91sam9x5_caps = {
8009 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8010 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8011 -                         AT91_PMC_PCK1RDY | AT91_PMC_MOSCSELS |
8012 -                         AT91_PMC_MOSCRCS | AT91_PMC_CFDEV,
8015 -static const struct at91_pmc_caps sama5d2_caps = {
8016 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8017 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8018 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
8019 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
8020 -                         AT91_PMC_CFDEV | AT91_PMC_GCKRDY,
8023 -static const struct at91_pmc_caps sama5d3_caps = {
8024 -       .available_irqs = AT91_PMC_MOSCS | AT91_PMC_LOCKA | AT91_PMC_MCKRDY |
8025 -                         AT91_PMC_LOCKU | AT91_PMC_PCK0RDY |
8026 -                         AT91_PMC_PCK1RDY | AT91_PMC_PCK2RDY |
8027 -                         AT91_PMC_MOSCSELS | AT91_PMC_MOSCRCS |
8028 -                         AT91_PMC_CFDEV,
8031 -static struct at91_pmc *__init at91_pmc_init(struct device_node *np,
8032 -                                            void __iomem *regbase, int virq,
8033 -                                            const struct at91_pmc_caps *caps)
8035 -       struct at91_pmc *pmc;
8037 -       if (!regbase || !virq ||  !caps)
8038 -               return NULL;
8040 -       at91_pmc_base = regbase;
8042 -       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
8043 -       if (!pmc)
8044 -               return NULL;
8046 -       spin_lock_init(&pmc->lock);
8047 -       pmc->regbase = regbase;
8048 -       pmc->virq = virq;
8049 -       pmc->caps = caps;
8051 -       pmc->irqdomain = irq_domain_add_linear(np, 32, &pmc_irq_ops, pmc);
8053 -       if (!pmc->irqdomain)
8054 -               goto out_free_pmc;
8056 -       pmc_write(pmc, AT91_PMC_IDR, 0xffffffff);
8057 -       if (request_irq(pmc->virq, pmc_irq_handler,
8058 -                       IRQF_SHARED | IRQF_COND_SUSPEND, "pmc", pmc))
8059 -               goto out_remove_irqdomain;
8061 -       return pmc;
8063 -out_remove_irqdomain:
8064 -       irq_domain_remove(pmc->irqdomain);
8065 -out_free_pmc:
8066 -       kfree(pmc);
8068 -       return NULL;
8071 -static const struct of_device_id pmc_clk_ids[] __initconst = {
8072 -       /* Slow oscillator */
8073 -       {
8074 -               .compatible = "atmel,at91sam9260-clk-slow",
8075 -               .data = of_at91sam9260_clk_slow_setup,
8076 -       },
8077 -       /* Main clock */
8078 -       {
8079 -               .compatible = "atmel,at91rm9200-clk-main-osc",
8080 -               .data = of_at91rm9200_clk_main_osc_setup,
8081 -       },
8082 -       {
8083 -               .compatible = "atmel,at91sam9x5-clk-main-rc-osc",
8084 -               .data = of_at91sam9x5_clk_main_rc_osc_setup,
8085 -       },
8086 -       {
8087 -               .compatible = "atmel,at91rm9200-clk-main",
8088 -               .data = of_at91rm9200_clk_main_setup,
8089 -       },
8090 -       {
8091 -               .compatible = "atmel,at91sam9x5-clk-main",
8092 -               .data = of_at91sam9x5_clk_main_setup,
8093 -       },
8094 -       /* PLL clocks */
8095 -       {
8096 -               .compatible = "atmel,at91rm9200-clk-pll",
8097 -               .data = of_at91rm9200_clk_pll_setup,
8098 -       },
8099 -       {
8100 -               .compatible = "atmel,at91sam9g45-clk-pll",
8101 -               .data = of_at91sam9g45_clk_pll_setup,
8102 -       },
8103 -       {
8104 -               .compatible = "atmel,at91sam9g20-clk-pllb",
8105 -               .data = of_at91sam9g20_clk_pllb_setup,
8106 -       },
8107 -       {
8108 -               .compatible = "atmel,sama5d3-clk-pll",
8109 -               .data = of_sama5d3_clk_pll_setup,
8110 -       },
8111 -       {
8112 -               .compatible = "atmel,at91sam9x5-clk-plldiv",
8113 -               .data = of_at91sam9x5_clk_plldiv_setup,
8114 -       },
8115 -       /* Master clock */
8116 -       {
8117 -               .compatible = "atmel,at91rm9200-clk-master",
8118 -               .data = of_at91rm9200_clk_master_setup,
8119 -       },
8120 -       {
8121 -               .compatible = "atmel,at91sam9x5-clk-master",
8122 -               .data = of_at91sam9x5_clk_master_setup,
8123 -       },
8124 -       /* System clocks */
8125 -       {
8126 -               .compatible = "atmel,at91rm9200-clk-system",
8127 -               .data = of_at91rm9200_clk_sys_setup,
8128 -       },
8129 -       /* Peripheral clocks */
8130 -       {
8131 -               .compatible = "atmel,at91rm9200-clk-peripheral",
8132 -               .data = of_at91rm9200_clk_periph_setup,
8133 -       },
8134 -       {
8135 -               .compatible = "atmel,at91sam9x5-clk-peripheral",
8136 -               .data = of_at91sam9x5_clk_periph_setup,
8137 -       },
8138 -       /* Programmable clocks */
8139 -       {
8140 -               .compatible = "atmel,at91rm9200-clk-programmable",
8141 -               .data = of_at91rm9200_clk_prog_setup,
8142 -       },
8143 -       {
8144 -               .compatible = "atmel,at91sam9g45-clk-programmable",
8145 -               .data = of_at91sam9g45_clk_prog_setup,
8146 -       },
8147 -       {
8148 -               .compatible = "atmel,at91sam9x5-clk-programmable",
8149 -               .data = of_at91sam9x5_clk_prog_setup,
8150 -       },
8151 -       /* UTMI clock */
8152 -#if defined(CONFIG_HAVE_AT91_UTMI)
8153 -       {
8154 -               .compatible = "atmel,at91sam9x5-clk-utmi",
8155 -               .data = of_at91sam9x5_clk_utmi_setup,
8156 -       },
8157 -#endif
8158 -       /* USB clock */
8159 -#if defined(CONFIG_HAVE_AT91_USB_CLK)
8160 -       {
8161 -               .compatible = "atmel,at91rm9200-clk-usb",
8162 -               .data = of_at91rm9200_clk_usb_setup,
8163 -       },
8164 -       {
8165 -               .compatible = "atmel,at91sam9x5-clk-usb",
8166 -               .data = of_at91sam9x5_clk_usb_setup,
8167 -       },
8168 -       {
8169 -               .compatible = "atmel,at91sam9n12-clk-usb",
8170 -               .data = of_at91sam9n12_clk_usb_setup,
8171 -       },
8172 -#endif
8173 -       /* SMD clock */
8174 -#if defined(CONFIG_HAVE_AT91_SMD)
8175 -       {
8176 -               .compatible = "atmel,at91sam9x5-clk-smd",
8177 -               .data = of_at91sam9x5_clk_smd_setup,
8178 -       },
8179 -#endif
8180 -#if defined(CONFIG_HAVE_AT91_H32MX)
8181 -       {
8182 -               .compatible = "atmel,sama5d4-clk-h32mx",
8183 -               .data = of_sama5d4_clk_h32mx_setup,
8184 -       },
8185 -#endif
8186 -#if defined(CONFIG_HAVE_AT91_GENERATED_CLK)
8187 -       {
8188 -               .compatible = "atmel,sama5d2-clk-generated",
8189 -               .data = of_sama5d2_clk_generated_setup,
8190 -       },
8191 -#endif
8192 -       { /*sentinel*/ }
8195 -static void __init of_at91_pmc_setup(struct device_node *np,
8196 -                                    const struct at91_pmc_caps *caps)
8198 -       struct at91_pmc *pmc;
8199 -       struct device_node *childnp;
8200 -       void (*clk_setup)(struct device_node *, struct at91_pmc *);
8201 -       const struct of_device_id *clk_id;
8202 -       void __iomem *regbase = of_iomap(np, 0);
8203 -       int virq;
8205 -       if (!regbase)
8206 -               return;
8208 -       virq = irq_of_parse_and_map(np, 0);
8209 -       if (!virq)
8210 -               return;
8212 -       pmc = at91_pmc_init(np, regbase, virq, caps);
8213 -       if (!pmc)
8214 -               return;
8215 -       for_each_child_of_node(np, childnp) {
8216 -               clk_id = of_match_node(pmc_clk_ids, childnp);
8217 -               if (!clk_id)
8218 -                       continue;
8219 -               clk_setup = clk_id->data;
8220 -               clk_setup(childnp, pmc);
8221 -       }
8224 -static void __init of_at91rm9200_pmc_setup(struct device_node *np)
8226 -       of_at91_pmc_setup(np, &at91rm9200_caps);
8228 -CLK_OF_DECLARE(at91rm9200_clk_pmc, "atmel,at91rm9200-pmc",
8229 -              of_at91rm9200_pmc_setup);
8231 -static void __init of_at91sam9260_pmc_setup(struct device_node *np)
8233 -       of_at91_pmc_setup(np, &at91sam9260_caps);
8235 -CLK_OF_DECLARE(at91sam9260_clk_pmc, "atmel,at91sam9260-pmc",
8236 -              of_at91sam9260_pmc_setup);
8238 -static void __init of_at91sam9g45_pmc_setup(struct device_node *np)
8240 -       of_at91_pmc_setup(np, &at91sam9g45_caps);
8242 -CLK_OF_DECLARE(at91sam9g45_clk_pmc, "atmel,at91sam9g45-pmc",
8243 -              of_at91sam9g45_pmc_setup);
8245 -static void __init of_at91sam9n12_pmc_setup(struct device_node *np)
8247 -       of_at91_pmc_setup(np, &at91sam9n12_caps);
8249 -CLK_OF_DECLARE(at91sam9n12_clk_pmc, "atmel,at91sam9n12-pmc",
8250 -              of_at91sam9n12_pmc_setup);
8252 -static void __init of_at91sam9x5_pmc_setup(struct device_node *np)
8254 -       of_at91_pmc_setup(np, &at91sam9x5_caps);
8256 -CLK_OF_DECLARE(at91sam9x5_clk_pmc, "atmel,at91sam9x5-pmc",
8257 -              of_at91sam9x5_pmc_setup);
8259 -static void __init of_sama5d2_pmc_setup(struct device_node *np)
8261 -       of_at91_pmc_setup(np, &sama5d2_caps);
8263 -CLK_OF_DECLARE(sama5d2_clk_pmc, "atmel,sama5d2-pmc",
8264 -              of_sama5d2_pmc_setup);
8266 -static void __init of_sama5d3_pmc_setup(struct device_node *np)
8268 -       of_at91_pmc_setup(np, &sama5d3_caps);
8270 -CLK_OF_DECLARE(sama5d3_clk_pmc, "atmel,sama5d3-pmc",
8271 -              of_sama5d3_pmc_setup);
8272 diff --git a/drivers/clk/at91/pmc.h b/drivers/clk/at91/pmc.h
8273 index f65739272779..5771fff0ee3f 100644
8274 --- a/drivers/clk/at91/pmc.h
8275 +++ b/drivers/clk/at91/pmc.h
8276 @@ -14,8 +14,11 @@
8278  #include <linux/io.h>
8279  #include <linux/irqdomain.h>
8280 +#include <linux/regmap.h>
8281  #include <linux/spinlock.h>
8283 +extern spinlock_t pmc_pcr_lock;
8285  struct clk_range {
8286         unsigned long min;
8287         unsigned long max;
8288 @@ -23,102 +26,7 @@ struct clk_range {
8290  #define CLK_RANGE(MIN, MAX) {.min = MIN, .max = MAX,}
8292 -struct at91_pmc_caps {
8293 -       u32 available_irqs;
8296 -struct at91_pmc {
8297 -       void __iomem *regbase;
8298 -       int virq;
8299 -       spinlock_t lock;
8300 -       const struct at91_pmc_caps *caps;
8301 -       struct irq_domain *irqdomain;
8302 -       u32 imr;
8305 -static inline void pmc_lock(struct at91_pmc *pmc)
8307 -       spin_lock(&pmc->lock);
8310 -static inline void pmc_unlock(struct at91_pmc *pmc)
8312 -       spin_unlock(&pmc->lock);
8315 -static inline u32 pmc_read(struct at91_pmc *pmc, int offset)
8317 -       return readl(pmc->regbase + offset);
8320 -static inline void pmc_write(struct at91_pmc *pmc, int offset, u32 value)
8322 -       writel(value, pmc->regbase + offset);
8325  int of_at91_get_clk_range(struct device_node *np, const char *propname,
8326                           struct clk_range *range);
8328 -void of_at91sam9260_clk_slow_setup(struct device_node *np,
8329 -                                  struct at91_pmc *pmc);
8331 -void of_at91rm9200_clk_main_osc_setup(struct device_node *np,
8332 -                                     struct at91_pmc *pmc);
8333 -void of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np,
8334 -                                        struct at91_pmc *pmc);
8335 -void of_at91rm9200_clk_main_setup(struct device_node *np,
8336 -                                 struct at91_pmc *pmc);
8337 -void of_at91sam9x5_clk_main_setup(struct device_node *np,
8338 -                                 struct at91_pmc *pmc);
8340 -void of_at91rm9200_clk_pll_setup(struct device_node *np,
8341 -                                struct at91_pmc *pmc);
8342 -void of_at91sam9g45_clk_pll_setup(struct device_node *np,
8343 -                                 struct at91_pmc *pmc);
8344 -void of_at91sam9g20_clk_pllb_setup(struct device_node *np,
8345 -                                  struct at91_pmc *pmc);
8346 -void of_sama5d3_clk_pll_setup(struct device_node *np,
8347 -                             struct at91_pmc *pmc);
8348 -void of_at91sam9x5_clk_plldiv_setup(struct device_node *np,
8349 -                                   struct at91_pmc *pmc);
8351 -void of_at91rm9200_clk_master_setup(struct device_node *np,
8352 -                                   struct at91_pmc *pmc);
8353 -void of_at91sam9x5_clk_master_setup(struct device_node *np,
8354 -                                   struct at91_pmc *pmc);
8356 -void of_at91rm9200_clk_sys_setup(struct device_node *np,
8357 -                                struct at91_pmc *pmc);
8359 -void of_at91rm9200_clk_periph_setup(struct device_node *np,
8360 -                                   struct at91_pmc *pmc);
8361 -void of_at91sam9x5_clk_periph_setup(struct device_node *np,
8362 -                                   struct at91_pmc *pmc);
8364 -void of_at91rm9200_clk_prog_setup(struct device_node *np,
8365 -                                 struct at91_pmc *pmc);
8366 -void of_at91sam9g45_clk_prog_setup(struct device_node *np,
8367 -                                  struct at91_pmc *pmc);
8368 -void of_at91sam9x5_clk_prog_setup(struct device_node *np,
8369 -                                 struct at91_pmc *pmc);
8371 -void of_at91sam9x5_clk_utmi_setup(struct device_node *np,
8372 -                                 struct at91_pmc *pmc);
8374 -void of_at91rm9200_clk_usb_setup(struct device_node *np,
8375 -                                struct at91_pmc *pmc);
8376 -void of_at91sam9x5_clk_usb_setup(struct device_node *np,
8377 -                                struct at91_pmc *pmc);
8378 -void of_at91sam9n12_clk_usb_setup(struct device_node *np,
8379 -                                 struct at91_pmc *pmc);
8381 -void of_at91sam9x5_clk_smd_setup(struct device_node *np,
8382 -                                struct at91_pmc *pmc);
8384 -void of_sama5d4_clk_h32mx_setup(struct device_node *np,
8385 -                               struct at91_pmc *pmc);
8387 -void of_sama5d2_clk_generated_setup(struct device_node *np,
8388 -                                   struct at91_pmc *pmc);
8390  #endif /* __PMC_H_ */
8391 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
8392 index 4da2af9694a2..5b6f57f500b8 100644
8393 --- a/drivers/clocksource/tcb_clksrc.c
8394 +++ b/drivers/clocksource/tcb_clksrc.c
8395 @@ -23,8 +23,7 @@
8396   *     this 32 bit free-running counter. the second channel is not used.
8397   *
8398   *   - The third channel may be used to provide a 16-bit clockevent
8399 - *     source, used in either periodic or oneshot mode.  This runs
8400 - *     at 32 KiHZ, and can handle delays of up to two seconds.
8401 + *     source, used in either periodic or oneshot mode.
8402   *
8403   * A boot clocksource and clockevent source are also currently needed,
8404   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
8405 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
8406  struct tc_clkevt_device {
8407         struct clock_event_device       clkevt;
8408         struct clk                      *clk;
8409 +       bool                            clk_enabled;
8410 +       u32                             freq;
8411         void __iomem                    *regs;
8412  };
8414 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
8415         return container_of(clkevt, struct tc_clkevt_device, clkevt);
8418 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
8419 - * because using one of the divided clocks would usually mean the
8420 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
8421 - *
8422 - * A divided clock could be good for high resolution timers, since
8423 - * 30.5 usec resolution can seem "low".
8424 - */
8425  static u32 timer_clock;
8427 +static void tc_clk_disable(struct clock_event_device *d)
8429 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8431 +       clk_disable(tcd->clk);
8432 +       tcd->clk_enabled = false;
8435 +static void tc_clk_enable(struct clock_event_device *d)
8437 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8439 +       if (tcd->clk_enabled)
8440 +               return;
8441 +       clk_enable(tcd->clk);
8442 +       tcd->clk_enabled = true;
8445  static int tc_shutdown(struct clock_event_device *d)
8447         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
8448 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
8450         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
8451         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
8452 +       return 0;
8455 +static int tc_shutdown_clk_off(struct clock_event_device *d)
8457 +       tc_shutdown(d);
8458         if (!clockevent_state_detached(d))
8459 -               clk_disable(tcd->clk);
8460 +               tc_clk_disable(d);
8462         return 0;
8464 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
8465         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
8466                 tc_shutdown(d);
8468 -       clk_enable(tcd->clk);
8469 +       tc_clk_enable(d);
8471 -       /* slow clock, count up to RC, then irq and stop */
8472 +       /* count up to RC, then irq and stop */
8473         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
8474                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
8475         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8476 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
8477         /* By not making the gentime core emulate periodic mode on top
8478          * of oneshot, we get lower overhead and improved accuracy.
8479          */
8480 -       clk_enable(tcd->clk);
8481 +       tc_clk_enable(d);
8483 -       /* slow clock, count up to RC, then irq and restart */
8484 +       /* count up to RC, then irq and restart */
8485         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
8486                      regs + ATMEL_TC_REG(2, CMR));
8487 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8488 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
8490         /* Enable clock and interrupts on RC compare */
8491         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
8492 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
8493                 .features               = CLOCK_EVT_FEAT_PERIODIC |
8494                                           CLOCK_EVT_FEAT_ONESHOT,
8495                 /* Should be lower than at91rm9200's system timer */
8496 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8497                 .rating                 = 125,
8498 +#else
8499 +               .rating                 = 200,
8500 +#endif
8501                 .set_next_event         = tc_next_event,
8502 -               .set_state_shutdown     = tc_shutdown,
8503 +               .set_state_shutdown     = tc_shutdown_clk_off,
8504                 .set_state_periodic     = tc_set_periodic,
8505                 .set_state_oneshot      = tc_set_oneshot,
8506         },
8507 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
8508         return IRQ_NONE;
8511 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8512 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
8514 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
8515         int ret;
8516         struct clk *t2_clk = tc->clk[2];
8517         int irq = tc->irq[2];
8518 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8519         clkevt.regs = tc->regs;
8520         clkevt.clk = t2_clk;
8522 -       timer_clock = clk32k_divisor_idx;
8523 +       timer_clock = divisor_idx;
8524 +       if (!divisor)
8525 +               clkevt.freq = 32768;
8526 +       else
8527 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
8529         clkevt.clkevt.cpumask = cpumask_of(0);
8531 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
8532                 return ret;
8533         }
8535 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
8536 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
8538         return ret;
8540 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
8541                 goto err_disable_t1;
8543         /* channel 2:  periodic and oneshot timer support */
8544 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
8545         ret = setup_clkevents(tc, clk32k_divisor_idx);
8546 +#else
8547 +       ret = setup_clkevents(tc, best_divisor_idx);
8548 +#endif
8549         if (ret)
8550                 goto err_unregister_clksrc;
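
The tcb_clksrc.c changes above let the clockevent channel run from a divided peripheral clock instead of being hard-wired to the 32 kHz slow clock: clkevt.freq is derived from clk_get_rate() and the selected divisor, the periodic reload value scales with that frequency, and the rating rises to 200 when the faster clock is in use. A small illustration of the reload arithmetic; the 5 MHz figure is only an example of a divided timer rate, not a value taken from the patch:

/* sketch: periodic reload value for a given timer clock, same rounding
 * as the (freq + HZ / 2) / HZ expression written into RC above */
static unsigned int example_periodic_rc(unsigned long timer_hz, unsigned int hz)
{
	return (timer_hz + hz / 2) / hz;
}

/* e.g. with HZ == 100:
 *	example_periodic_rc(32768, 100)   == 328	(slow clock)
 *	example_periodic_rc(5000000, 100) == 50000	(divided clock, still
 *							within the 16-bit counter) */
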
8552 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
8553 index d911c5dca8f1..7a40f7e88468 100644
8554 --- a/drivers/clocksource/timer-atmel-pit.c
8555 +++ b/drivers/clocksource/timer-atmel-pit.c
8556 @@ -46,6 +46,7 @@ struct pit_data {
8557         u32             cycle;
8558         u32             cnt;
8559         unsigned int    irq;
8560 +       bool            irq_requested;
8561         struct clk      *mck;
8562  };
8564 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
8566         /* disable irq, leaving the clocksource active */
8567         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
8568 +       if (data->irq_requested) {
8569 +               free_irq(data->irq, data);
8570 +               data->irq_requested = false;
8571 +       }
8572         return 0;
8575 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
8576  /*
8577   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
8578   */
8579  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
8581         struct pit_data *data = clkevt_to_pit_data(dev);
8582 +       int ret;
8584 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8585 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8586 +                         "at91_tick", data);
8587 +       if (ret)
8588 +               panic(pr_fmt("Unable to setup IRQ\n"));
8590 +       data->irq_requested = true;
8592         /* update clocksource counter */
8593         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
8594 @@ -181,7 +196,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8596         unsigned long   pit_rate;
8597         unsigned        bits;
8598 -       int             ret;
8600         /*
8601          * Use our actual MCK to figure out how many MCK/16 ticks per
8602 @@ -206,13 +220,6 @@ static void __init at91sam926x_pit_common_init(struct pit_data *data)
8603         data->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
8604         clocksource_register_hz(&data->clksrc, pit_rate);
8606 -       /* Set up irq handler */
8607 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
8608 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8609 -                         "at91_tick", data);
8610 -       if (ret)
8611 -               panic(pr_fmt("Unable to setup IRQ\n"));
8613         /* Set up and register clockevents */
8614         data->clkevt.name = "pit";
8615         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
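
The timer-atmel-pit.c hunks above stop requesting the tick interrupt at init time: the handler is now installed from the set_periodic callback, an irq_requested flag records that, and shutdown frees the IRQ again, so the interrupt exists only while the clockevent is actually running in periodic mode (the timer-atmel-st.c change that follows applies the same idea). A reduced sketch of that request-on-demand pattern, using a hypothetical device structure and returning the error instead of panicking as the driver does:

/* sketch: request the tick IRQ only while the clockevent is periodic */
#include <linux/interrupt.h>

struct example_timer {
	unsigned int irq;
	bool irq_requested;
};

static irqreturn_t example_tick(int irq, void *dev_id)
{
	/* acknowledge the hardware and advance the tick here */
	return IRQ_HANDLED;
}

static int example_set_periodic(struct example_timer *t)
{
	int ret;

	ret = request_irq(t->irq, example_tick,
			  IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
			  "example_tick", t);
	if (ret)
		return ret;
	t->irq_requested = true;

	/* ...program the hardware for periodic operation... */
	return 0;
}

static int example_shutdown(struct example_timer *t)
{
	/* ...stop the hardware first... */
	if (t->irq_requested) {
		free_irq(t->irq, t);
		t->irq_requested = false;
	}
	return 0;
}
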
8616 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
8617 index 29d21d68df5a..103d0fd70cc4 100644
8618 --- a/drivers/clocksource/timer-atmel-st.c
8619 +++ b/drivers/clocksource/timer-atmel-st.c
8620 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
8621         last_crtr = read_CRTR();
8624 +static int atmel_st_irq;
8626  static int clkevt32k_shutdown(struct clock_event_device *evt)
8628         clkdev32k_disable_and_flush_irq();
8629         irqmask = 0;
8630         regmap_write(regmap_st, AT91_ST_IER, irqmask);
8631 +       free_irq(atmel_st_irq, regmap_st);
8632         return 0;
8635  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8637 +       int ret;
8639         clkdev32k_disable_and_flush_irq();
8641 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8642 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8643 +                         "at91_tick", regmap_st);
8644 +       if (ret)
8645 +               panic(pr_fmt("Unable to setup IRQ\n"));
8647         /*
8648          * ALM for oneshot irqs, set by next_event()
8649          * before 32 seconds have passed.
8650 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
8652  static int clkevt32k_set_periodic(struct clock_event_device *dev)
8654 +       int ret;
8656         clkdev32k_disable_and_flush_irq();
8658 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
8659 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8660 +                         "at91_tick", regmap_st);
8661 +       if (ret)
8662 +               panic(pr_fmt("Unable to setup IRQ\n"));
8664         /* PIT for periodic irqs; fixed rate of 1/HZ */
8665         irqmask = AT91_ST_PITS;
8666         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
8667 @@ -198,7 +217,7 @@ static void __init atmel_st_timer_init(struct device_node *node)
8669         struct clk *sclk;
8670         unsigned int sclk_rate, val;
8671 -       int irq, ret;
8672 +       int ret;
8674         regmap_st = syscon_node_to_regmap(node);
8675         if (IS_ERR(regmap_st))
8676 @@ -210,17 +229,10 @@ static void __init atmel_st_timer_init(struct device_node *node)
8677         regmap_read(regmap_st, AT91_ST_SR, &val);
8679         /* Get the interrupts property */
8680 -       irq  = irq_of_parse_and_map(node, 0);
8681 -       if (!irq)
8682 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
8683 +       if (!atmel_st_irq)
8684                 panic(pr_fmt("Unable to get IRQ from DT\n"));
8686 -       /* Make IRQs happen for the system timer */
8687 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
8688 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
8689 -                         "at91_tick", regmap_st);
8690 -       if (ret)
8691 -               panic(pr_fmt("Unable to setup IRQ\n"));
8693         sclk = of_clk_get(node, 0);
8694         if (IS_ERR(sclk))
8695                 panic(pr_fmt("Unable to get slow clock\n"));
8696 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
8697 index c59bdcb83217..8f23161d80be 100644
8698 --- a/drivers/cpufreq/Kconfig.x86
8699 +++ b/drivers/cpufreq/Kconfig.x86
8700 @@ -123,7 +123,7 @@ config X86_POWERNOW_K7_ACPI
8702  config X86_POWERNOW_K8
8703         tristate "AMD Opteron/Athlon64 PowerNow!"
8704 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
8705 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
8706         help
8707           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
8708           Support for K10 and newer processors is now in acpi-cpufreq.
8709 diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
8710 index 344058f8501a..d5657d50ac40 100644
8711 --- a/drivers/cpuidle/coupled.c
8712 +++ b/drivers/cpuidle/coupled.c
8713 @@ -119,7 +119,6 @@ struct cpuidle_coupled {
8715  #define CPUIDLE_COUPLED_NOT_IDLE       (-1)
8717 -static DEFINE_MUTEX(cpuidle_coupled_lock);
8718  static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
8720  /*
8721 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8722 index 6ed7d63a0688..9da7482ad256 100644
8723 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8724 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
8725 @@ -1264,7 +1264,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
8726         if (ret)
8727                 return ret;
8729 +#ifndef CONFIG_PREEMPT_RT_BASE
8730         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
8731 +#endif
8733         i915_gem_execbuffer_move_to_active(vmas, params->request);
8734         i915_gem_execbuffer_retire_commands(params);
8735 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8736 index c0a96f1ee18e..deb1e207fa3c 100644
8737 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
8738 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
8739 @@ -39,7 +39,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
8740         if (!mutex_is_locked(mutex))
8741                 return false;
8743 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
8744 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
8745         return mutex->owner == task;
8746  #else
8747         /* Since UP may be pre-empted, we cannot assume that we own the lock */
8748 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
8749 index b7b0a38acd67..148dcb1349d5 100644
8750 --- a/drivers/gpu/drm/i915/i915_irq.c
8751 +++ b/drivers/gpu/drm/i915/i915_irq.c
8752 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8753         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
8755         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8756 +       preempt_disable_rt();
8758         /* Get optional system timestamp before query. */
8759         if (stime)
8760 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8761                 *etime = ktime_get();
8763         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8764 +       preempt_enable_rt();
8766         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
8768 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
8769 index 4f5d07bb3511..8ecd5c016dba 100644
8770 --- a/drivers/gpu/drm/i915/intel_display.c
8771 +++ b/drivers/gpu/drm/i915/intel_display.c
8772 @@ -11400,7 +11400,7 @@ void intel_check_page_flip(struct drm_device *dev, int pipe)
8773         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
8774         struct intel_unpin_work *work;
8776 -       WARN_ON(!in_interrupt());
8777 +       WARN_ON_NONRT(!in_interrupt());
8779         if (crtc == NULL)
8780                 return;
8781 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
8782 index 2cc6aa072f4c..b79d33f14868 100644
8783 --- a/drivers/gpu/drm/i915/intel_sprite.c
8784 +++ b/drivers/gpu/drm/i915/intel_sprite.c
8785 @@ -38,6 +38,7 @@
8786  #include "intel_drv.h"
8787  #include <drm/i915_drm.h>
8788  #include "i915_drv.h"
8789 +#include <linux/locallock.h>
8791  static bool
8792  format_is_yuv(uint32_t format)
8793 @@ -64,6 +65,8 @@ static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
8794                             1000 * adjusted_mode->crtc_htotal);
8797 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
8799  /**
8800   * intel_pipe_update_start() - start update of a set of display registers
8801   * @crtc: the crtc of which the registers are going to be updated
8802 @@ -96,7 +99,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8803         min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
8804         max = vblank_start - 1;
8806 -       local_irq_disable();
8807 +       local_lock_irq(pipe_update_lock);
8809         if (min <= 0 || max <= 0)
8810                 return;
8811 @@ -126,11 +129,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
8812                         break;
8813                 }
8815 -               local_irq_enable();
8816 +               local_unlock_irq(pipe_update_lock);
8818                 timeout = schedule_timeout(timeout);
8820 -               local_irq_disable();
8821 +               local_lock_irq(pipe_update_lock);
8822         }
8824         finish_wait(wq, &wait);
8825 @@ -164,7 +167,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc)
8827         trace_i915_pipe_update_end(crtc, end_vbl_count, scanline_end);
8829 -       local_irq_enable();
8830 +       local_unlock_irq(pipe_update_lock);
8832         if (crtc->debug.start_vbl_count &&
8833             crtc->debug.start_vbl_count != end_vbl_count) {
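The intel_sprite.c hunk above replaces raw interrupt disabling around the vblank-evasion loop with the RT "local lock" primitives it pulls in from <linux/locallock.h>. The sketch below shows the bare usage pattern those hunks rely on, with a hypothetical lock name and critical section; on a non-RT build these helpers are expected to reduce to plain local_irq_disable()/local_irq_enable(), while on PREEMPT_RT they take a per-CPU sleeping lock so the section stays preemptible.

        #include <linux/locallock.h>

        /* Hypothetical per-CPU lock for a short section that must not be
         * interleaved with other users on the same CPU. */
        static DEFINE_LOCAL_IRQ_LOCK(example_update_lock);

        static void example_critical_update(void)
        {
                local_lock_irq(example_update_lock);
                /* ... work serialized per CPU; preemptible on PREEMPT_RT ... */
                local_unlock_irq(example_update_lock);
        }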
8834 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
8835 index 3645b223aa37..642854b2ed2c 100644
8836 --- a/drivers/gpu/drm/radeon/radeon_display.c
8837 +++ b/drivers/gpu/drm/radeon/radeon_display.c
8838 @@ -1862,6 +1862,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8839         struct radeon_device *rdev = dev->dev_private;
8841         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
8842 +       preempt_disable_rt();
8844         /* Get optional system timestamp before query. */
8845         if (stime)
8846 @@ -1954,6 +1955,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
8847                 *etime = ktime_get();
8849         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
8850 +       preempt_enable_rt();
8852         /* Decode into vertical and horizontal scanout position. */
8853         *vpos = position & 0x1fff;
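Both scanout-position hunks (i915_irq.c above and radeon_display.c here) turn the long-standing placeholder comments into real preempt_disable_rt()/preempt_enable_rt() calls: the system timestamps and the hardware scanout position must be read back to back, and on PREEMPT_RT the surrounding locking no longer guarantees that preemption is off. The exact helper definitions live elsewhere in this patch, so treat the following only as an assumed sketch of their shape: no-ops on a stock kernel, real preemption control on RT.

        #ifdef CONFIG_PREEMPT_RT_BASE
        # define preempt_disable_rt()   preempt_disable()
        # define preempt_enable_rt()    preempt_enable()
        #else
        # define preempt_disable_rt()   barrier()
        # define preempt_enable_rt()    barrier()
        #endif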
8854 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
8855 index 802dcb409030..d6d9427860d8 100644
8856 --- a/drivers/hv/vmbus_drv.c
8857 +++ b/drivers/hv/vmbus_drv.c
8858 @@ -820,7 +820,7 @@ static void vmbus_isr(void)
8859                         tasklet_schedule(&msg_dpc);
8860         }
8862 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
8863 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, 0);
8867 diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c
8868 index 08d26ba61ed3..46b89dd42b10 100644
8869 --- a/drivers/i2c/busses/i2c-omap.c
8870 +++ b/drivers/i2c/busses/i2c-omap.c
8871 @@ -995,15 +995,12 @@ omap_i2c_isr(int irq, void *dev_id)
8872         u16 mask;
8873         u16 stat;
8875 -       spin_lock(&omap->lock);
8876 -       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8877         stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
8878 +       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
8880         if (stat & mask)
8881                 ret = IRQ_WAKE_THREAD;
8883 -       spin_unlock(&omap->lock);
8885         return ret;
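The omap_i2c_isr() above is only the quick half of a threaded interrupt handler: it reads the status and enable registers and, when something is pending, hands the real work to the IRQ thread via IRQ_WAKE_THREAD. Dropping the spinlock from this hard-IRQ path matters on PREEMPT_RT, where spinlock_t becomes a sleeping lock that must not be taken in true interrupt context. A generic sketch of that split, using hypothetical names around the stock request_threaded_irq() API:

        #include <linux/interrupt.h>

        /* Quick check in hard-IRQ context: no sleeping locks, just decide. */
        static irqreturn_t example_quick_isr(int irq, void *dev_id)
        {
                bool pending = true;    /* placeholder for a status-register read */

                return pending ? IRQ_WAKE_THREAD : IRQ_NONE;
        }

        /* Heavy lifting in the IRQ thread, where locks and sleeping are fine. */
        static irqreturn_t example_thread_isr(int irq, void *dev_id)
        {
                /* ... talk to the device, update driver state ... */
                return IRQ_HANDLED;
        }

        /* Registration (error handling omitted):
         * request_threaded_irq(irq, example_quick_isr, example_thread_isr,
         *                      IRQF_ONESHOT, "example", dev);
         */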
8888 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
8889 index 36f76e28a0bf..394f142f90c7 100644
8890 --- a/drivers/ide/alim15x3.c
8891 +++ b/drivers/ide/alim15x3.c
8892 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
8894         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
8896 -       local_irq_save(flags);
8897 +       local_irq_save_nort(flags);
8899         if (m5229_revision < 0xC2) {
8900                 /*
8901 @@ -325,7 +325,7 @@ out:
8902         }
8903         pci_dev_put(north);
8904         pci_dev_put(isa_dev);
8905 -       local_irq_restore(flags);
8906 +       local_irq_restore_nort(flags);
8907         return 0;
8910 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
8911 index 696b6c1ec940..0d0a96629b73 100644
8912 --- a/drivers/ide/hpt366.c
8913 +++ b/drivers/ide/hpt366.c
8914 @@ -1241,7 +1241,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8916         dma_old = inb(base + 2);
8918 -       local_irq_save(flags);
8919 +       local_irq_save_nort(flags);
8921         dma_new = dma_old;
8922         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
8923 @@ -1252,7 +1252,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
8924         if (dma_new != dma_old)
8925                 outb(dma_new, base + 2);
8927 -       local_irq_restore(flags);
8928 +       local_irq_restore_nort(flags);
8930         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
8931                          hwif->name, base, base + 7);
8932 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
8933 index 19763977568c..4169433faab5 100644
8934 --- a/drivers/ide/ide-io-std.c
8935 +++ b/drivers/ide/ide-io-std.c
8936 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8937                 unsigned long uninitialized_var(flags);
8939                 if ((io_32bit & 2) && !mmio) {
8940 -                       local_irq_save(flags);
8941 +                       local_irq_save_nort(flags);
8942                         ata_vlb_sync(io_ports->nsect_addr);
8943                 }
8945 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8946                         insl(data_addr, buf, words);
8948                 if ((io_32bit & 2) && !mmio)
8949 -                       local_irq_restore(flags);
8950 +                       local_irq_restore_nort(flags);
8952                 if (((len + 1) & 3) < 2)
8953                         return;
8954 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8955                 unsigned long uninitialized_var(flags);
8957                 if ((io_32bit & 2) && !mmio) {
8958 -                       local_irq_save(flags);
8959 +                       local_irq_save_nort(flags);
8960                         ata_vlb_sync(io_ports->nsect_addr);
8961                 }
8963 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
8964                         outsl(data_addr, buf, words);
8966                 if ((io_32bit & 2) && !mmio)
8967 -                       local_irq_restore(flags);
8968 +                       local_irq_restore_nort(flags);
8970                 if (((len + 1) & 3) < 2)
8971                         return;
8972 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
8973 index 669ea1e45795..e12e43e62245 100644
8974 --- a/drivers/ide/ide-io.c
8975 +++ b/drivers/ide/ide-io.c
8976 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
8977                 /* disable_irq_nosync ?? */
8978                 disable_irq(hwif->irq);
8979                 /* local CPU only, as if we were handling an interrupt */
8980 -               local_irq_disable();
8981 +               local_irq_disable_nort();
8982                 if (hwif->polling) {
8983                         startstop = handler(drive);
8984                 } else if (drive_is_ready(drive)) {
8985 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
8986 index 376f2dc410c5..f014dd1b73dc 100644
8987 --- a/drivers/ide/ide-iops.c
8988 +++ b/drivers/ide/ide-iops.c
8989 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
8990                                 if ((stat & ATA_BUSY) == 0)
8991                                         break;
8993 -                               local_irq_restore(flags);
8994 +                               local_irq_restore_nort(flags);
8995                                 *rstat = stat;
8996                                 return -EBUSY;
8997                         }
8998                 }
8999 -               local_irq_restore(flags);
9000 +               local_irq_restore_nort(flags);
9001         }
9002         /*
9003          * Allow status to settle, then read it again.
9004 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
9005 index 0b63facd1d87..4ceba37afc0c 100644
9006 --- a/drivers/ide/ide-probe.c
9007 +++ b/drivers/ide/ide-probe.c
9008 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
9009         int bswap = 1;
9011         /* local CPU only; some systems need this */
9012 -       local_irq_save(flags);
9013 +       local_irq_save_nort(flags);
9014         /* read 512 bytes of id info */
9015         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
9016 -       local_irq_restore(flags);
9017 +       local_irq_restore_nort(flags);
9019         drive->dev_flags |= IDE_DFLAG_ID_READ;
9020  #ifdef DEBUG
9021 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
9022 index a716693417a3..be0568c722d6 100644
9023 --- a/drivers/ide/ide-taskfile.c
9024 +++ b/drivers/ide/ide-taskfile.c
9025 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
9027                 page_is_high = PageHighMem(page);
9028                 if (page_is_high)
9029 -                       local_irq_save(flags);
9030 +                       local_irq_save_nort(flags);
9032                 buf = kmap_atomic(page) + offset;
9034 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
9035                 kunmap_atomic(buf);
9037                 if (page_is_high)
9038 -                       local_irq_restore(flags);
9039 +                       local_irq_restore_nort(flags);
9041                 len -= nr_bytes;
9042         }
9043 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
9044         }
9046         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
9047 -               local_irq_disable();
9048 +               local_irq_disable_nort();
9050         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
9052 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9053 index 5580ab0b5781..a123d0439c4c 100644
9054 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9055 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
9056 @@ -862,7 +862,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9058         ipoib_dbg_mcast(priv, "restarting multicast task\n");
9060 -       local_irq_save(flags);
9061 +       local_irq_save_nort(flags);
9062         netif_addr_lock(dev);
9063         spin_lock(&priv->lock);
9065 @@ -944,7 +944,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
9067         spin_unlock(&priv->lock);
9068         netif_addr_unlock(dev);
9069 -       local_irq_restore(flags);
9070 +       local_irq_restore_nort(flags);
9072         /*
9073          * make sure the in-flight joins have finished before we attempt
9074 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
9075 index 4a2a9e370be7..e970d9afd179 100644
9076 --- a/drivers/input/gameport/gameport.c
9077 +++ b/drivers/input/gameport/gameport.c
9078 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
9079         tx = ~0;
9081         for (i = 0; i < 50; i++) {
9082 -               local_irq_save(flags);
9083 +               local_irq_save_nort(flags);
9084                 t1 = ktime_get_ns();
9085                 for (t = 0; t < 50; t++)
9086                         gameport_read(gameport);
9087                 t2 = ktime_get_ns();
9088                 t3 = ktime_get_ns();
9089 -               local_irq_restore(flags);
9090 +               local_irq_restore_nort(flags);
9091                 udelay(i * 10);
9092                 t = (t2 - t1) - (t3 - t2);
9093                 if (t < tx)
9094 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9095         tx = 1 << 30;
9097         for(i = 0; i < 50; i++) {
9098 -               local_irq_save(flags);
9099 +               local_irq_save_nort(flags);
9100                 GET_TIME(t1);
9101                 for (t = 0; t < 50; t++) gameport_read(gameport);
9102                 GET_TIME(t2);
9103                 GET_TIME(t3);
9104 -               local_irq_restore(flags);
9105 +               local_irq_restore_nort(flags);
9106                 udelay(i * 10);
9107                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
9108         }
9109 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
9110         tx = 1 << 30;
9112         for(i = 0; i < 50; i++) {
9113 -               local_irq_save(flags);
9114 +               local_irq_save_nort(flags);
9115                 t1 = rdtsc();
9116                 for (t = 0; t < 50; t++) gameport_read(gameport);
9117                 t2 = rdtsc();
9118 -               local_irq_restore(flags);
9119 +               local_irq_restore_nort(flags);
9120                 udelay(i * 10);
9121                 if (t2 - t1 < tx) tx = t2 - t1;
9122         }
9123 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
9124 index a0ef57483ebb..b08732bae11a 100644
9125 --- a/drivers/iommu/amd_iommu.c
9126 +++ b/drivers/iommu/amd_iommu.c
9127 @@ -2022,10 +2022,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
9128         int ret;
9130         /*
9131 -        * Must be called with IRQs disabled. Warn here to detect early
9132 -        * when its not.
9133 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9134 +        * detect early when it's not.
9135          */
9136 -       WARN_ON(!irqs_disabled());
9137 +       WARN_ON_NONRT(!irqs_disabled());
9139         /* lock domain */
9140         spin_lock(&domain->lock);
9141 @@ -2188,10 +2188,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
9142         struct protection_domain *domain;
9144         /*
9145 -        * Must be called with IRQs disabled. Warn here to detect early
9146 -        * when its not.
9147 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
9148 +        * detect early when it's not.
9149          */
9150 -       WARN_ON(!irqs_disabled());
9151 +       WARN_ON_NONRT(!irqs_disabled());
9153         if (WARN_ON(!dev_data->domain))
9154                 return;
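The WARN_ON_NONRT()/BUG_ON_NONRT() substitutions in amd_iommu.c (and in dm.c and intel_display.c elsewhere in this patch) keep the "IRQs must be disabled" sanity checks on stock kernels while silencing them on PREEMPT_RT, where, per the updated comments, these paths legitimately run with interrupts enabled. The macros are defined elsewhere in the RT series; the sketch below is only an assumption about their likely shape, not the patch's actual definition.

        #ifdef CONFIG_PREEMPT_RT_BASE
        # define WARN_ON_NONRT(condition)       do { } while (0)
        # define BUG_ON_NONRT(condition)        do { } while (0)
        #else
        # define WARN_ON_NONRT(condition)       WARN_ON(condition)
        # define BUG_ON_NONRT(condition)        BUG_ON(condition)
        #endif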
9155 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
9156 index 5bda6a9b56bb..d6286584c807 100644
9157 --- a/drivers/leds/trigger/Kconfig
9158 +++ b/drivers/leds/trigger/Kconfig
9159 @@ -61,7 +61,7 @@ config LEDS_TRIGGER_BACKLIGHT
9161  config LEDS_TRIGGER_CPU
9162         bool "LED CPU Trigger"
9163 -       depends on LEDS_TRIGGERS
9164 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
9165         help
9166           This allows LEDs to be controlled by active CPUs. This shows
9167           the active CPUs across an array of LEDs so you can see which
9168 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
9169 index 4d200883c505..98b64ed5cb81 100644
9170 --- a/drivers/md/bcache/Kconfig
9171 +++ b/drivers/md/bcache/Kconfig
9172 @@ -1,6 +1,7 @@
9174  config BCACHE
9175         tristate "Block device as cache"
9176 +       depends on !PREEMPT_RT_FULL
9177         ---help---
9178         Allows a block device to be used as cache for other devices; uses
9179         a btree for indexing and the layout is optimized for SSDs.
9180 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9181 index 320eb3c4bb6b..aba2d71149e3 100644
9182 --- a/drivers/md/dm.c
9183 +++ b/drivers/md/dm.c
9184 @@ -2185,7 +2185,7 @@ static void dm_request_fn(struct request_queue *q)
9185                 /* Establish tio->ti before queuing work (map_tio_request) */
9186                 tio->ti = ti;
9187                 queue_kthread_work(&md->kworker, &tio->work);
9188 -               BUG_ON(!irqs_disabled());
9189 +               BUG_ON_NONRT(!irqs_disabled());
9190         }
9192         goto out;
9193 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
9194 index 4384b46cee1a..6415f094a4eb 100644
9195 --- a/drivers/md/raid5.c
9196 +++ b/drivers/md/raid5.c
9197 @@ -1920,8 +1920,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9198         struct raid5_percpu *percpu;
9199         unsigned long cpu;
9201 -       cpu = get_cpu();
9202 +       cpu = get_cpu_light();
9203         percpu = per_cpu_ptr(conf->percpu, cpu);
9204 +       spin_lock(&percpu->lock);
9205         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
9206                 ops_run_biofill(sh);
9207                 overlap_clear++;
9208 @@ -1977,7 +1978,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
9209                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
9210                                 wake_up(&sh->raid_conf->wait_for_overlap);
9211                 }
9212 -       put_cpu();
9213 +       spin_unlock(&percpu->lock);
9214 +       put_cpu_light();
9217  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
9218 @@ -6416,6 +6418,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
9219                                __func__, cpu);
9220                         break;
9221                 }
9222 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
9223         }
9224         put_online_cpus();
9226 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
9227 index 517d4b68a1be..efe91887ecd7 100644
9228 --- a/drivers/md/raid5.h
9229 +++ b/drivers/md/raid5.h
9230 @@ -504,6 +504,7 @@ struct r5conf {
9231         int                     recovery_disabled;
9232         /* per cpu variables */
9233         struct raid5_percpu {
9234 +               spinlock_t      lock;           /* Protection for -RT */
9235                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
9236                 struct flex_array *scribble;   /* space for constructing buffer
9237                                               * lists and performing address
9238 diff --git a/drivers/media/platform/vsp1/vsp1_video.c b/drivers/media/platform/vsp1/vsp1_video.c
9239 index 5ce88e1f5d71..b4f8cd74ecb8 100644
9240 --- a/drivers/media/platform/vsp1/vsp1_video.c
9241 +++ b/drivers/media/platform/vsp1/vsp1_video.c
9242 @@ -520,7 +520,7 @@ static bool vsp1_pipeline_stopped(struct vsp1_pipeline *pipe)
9243         bool stopped;
9245         spin_lock_irqsave(&pipe->irqlock, flags);
9246 -       stopped = pipe->state == VSP1_PIPELINE_STOPPED,
9247 +       stopped = pipe->state == VSP1_PIPELINE_STOPPED;
9248         spin_unlock_irqrestore(&pipe->irqlock, flags);
9250         return stopped;
9251 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
9252 index 4bf7d50b1bc7..6f7e99ad6e29 100644
9253 --- a/drivers/misc/Kconfig
9254 +++ b/drivers/misc/Kconfig
9255 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
9256  config ATMEL_TCLIB
9257         bool "Atmel AT32/AT91 Timer/Counter Library"
9258         depends on (AVR32 || ARCH_AT91)
9259 +       default y if PREEMPT_RT_FULL
9260         help
9261           Select this if you want a library to allocate the Timer/Counter
9262           blocks found on many Atmel processors.  This facilitates using
9263 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
9264           are combined to make a single 32-bit timer.
9266           When GENERIC_CLOCKEVENTS is defined, the third timer channel
9267 -         may be used as a clock event device supporting oneshot mode
9268 -         (delays of up to two seconds) based on the 32 KiHz clock.
9269 +         may be used as a clock event device supporting oneshot mode.
9271  config ATMEL_TCB_CLKSRC_BLOCK
9272         int
9273 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
9274           TC can be used for other purposes, such as PWM generation and
9275           interval timing.
9277 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
9278 +       bool "TC Block use 32 kHz clock"
9279 +       depends on ATMEL_TCB_CLKSRC
9280 +       default y if !PREEMPT_RT_FULL
9281 +       help
9282 +         Select this to use the 32 kHz base clock rate as the TC block
9283 +         clock source for clock events.
9286  config DUMMY_IRQ
9287         tristate "Dummy IRQ handler"
9288         default n
9289 @@ -113,6 +122,35 @@ config IBM_ASM
9290           for information on the specific driver level and support statement
9291           for your IBM server.
9293 +config HWLAT_DETECTOR
9294 +       tristate "Testing module to detect hardware-induced latencies"
9295 +       depends on DEBUG_FS
9296 +       depends on RING_BUFFER
9297 +       default m
9298 +       ---help---
9299 +         A simple hardware latency detector. Use this module to detect
9300 +         large latencies introduced by the behavior of the underlying
9301 +         system firmware external to Linux. We do this through periodic
9302 +         use of stop_machine to grab all available CPUs and measure
9303 +         for unexplainable gaps in the CPU timestamp counter(s). By
9304 +         default, the module is not enabled until the "enable" file
9305 +         within the "hwlat_detector" debugfs directory is toggled.
9307 +         This module is often used to detect SMI (System Management
9308 +         Interrupts) on x86 systems, though it is not x86 specific. To
9309 +         this end, we default to using a sample window of 1 second,
9310 +         during which we will sample for 0.5 seconds. If an SMI or
9311 +         similar event occurs during that time, it is recorded
9312 +         into an 8K-sample global ring buffer until retrieved.
9314 +         WARNING: This software should never be enabled (it can be built
9315 +         but should not be turned on after it is loaded) in a production
9316 +         environment where high latencies are a concern since the
9317 +         sampling mechanism actually introduces latencies for
9318 +         regular tasks while the CPU(s) are being held.
9320 +         If unsure, say N
9322  config PHANTOM
9323         tristate "Sensable PHANToM (PCI)"
9324         depends on PCI
9325 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
9326 index 537d7f3b78da..ec4aecba0656 100644
9327 --- a/drivers/misc/Makefile
9328 +++ b/drivers/misc/Makefile
9329 @@ -39,6 +39,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
9330  obj-$(CONFIG_HMC6352)          += hmc6352.o
9331  obj-y                          += eeprom/
9332  obj-y                          += cb710/
9333 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
9334  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
9335  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
9336  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
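The new file added next exposes its raw measurements through a debugfs "sample" file; as implemented by debug_sample_fread() further down, each line carries <seconds>.<nanoseconds>, the latency seen inside the sampling loop, and the latency seen between loop iterations, in microseconds and separated by tabs. A small user-space reader might look like the sketch below; the debugfs mount point and directory name are assumptions about the running system, not something this patch guarantees.

        #include <stdio.h>

        int main(void)
        {
                /* Assumes debugfs is mounted at /sys/kernel/debug. */
                FILE *f = fopen("/sys/kernel/debug/hwlat_detector/sample", "r");
                unsigned long sec, nsec;
                unsigned long long inner, outer;

                if (!f) {
                        perror("sample");
                        return 1;
                }
                /* Blocks until the detector records a sample above threshold. */
                while (fscanf(f, "%lu.%lu %llu %llu",
                              &sec, &nsec, &inner, &outer) == 4)
                        printf("%lu.%010lu inner=%lluus outer=%lluus\n",
                               sec, nsec, inner, outer);
                fclose(f);
                return 0;
        }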
9337 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
9338 new file mode 100644
9339 index 000000000000..52f5ad5fd9c0
9340 --- /dev/null
9341 +++ b/drivers/misc/hwlat_detector.c
9342 @@ -0,0 +1,1240 @@
9344 + * hwlat_detector.c - A simple Hardware Latency detector.
9345 + *
9346 + * Use this module to detect large system latencies induced by the behavior of
9347 + * certain underlying system hardware or firmware, independent of Linux itself.
9348 + * The code was developed originally to detect the presence of SMIs on Intel
9349 + * and AMD systems, although there is no dependency upon x86 herein.
9350 + *
9351 + * The classical example usage of this module is in detecting the presence of
9352 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
9353 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
9354 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
9355 + * LPC (or other device) to generate a special interrupt under certain
9356 + * circumstances, for example, upon expiration of a special SMI timer device,
9357 + * due to certain external thermal readings, on certain I/O address accesses,
9358 + * and other situations. An SMI hits a special CPU pin, triggers a special
9359 + * SMI mode (complete with special memory map), and the OS is unaware.
9360 + *
9361 + * Although certain hardware-induced latencies are necessary (for example,
9362 + * a modern system often requires an SMI handler for correct thermal control
9363 + * and remote management) they can wreak havoc upon any OS-level performance
9364 + * guarantees toward low-latency, especially when the OS is not even made
9365 + * aware of the presence of these interrupts. For this reason, we need a
9366 + * somewhat brute force mechanism to detect these interrupts. In this case,
9367 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
9368 + * sampling the built-in CPU timer, looking for discontiguous readings.
9369 + *
9370 + * WARNING: This implementation necessarily introduces latencies. Therefore,
9371 + *          you should NEVER use this module in a production environment
9372 + *          requiring any kind of low-latency performance guarantee(s).
9373 + *
9374 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
9375 + *
9376 + * Includes useful feedback from Clark Williams <clark@redhat.com>
9377 + *
9378 + * This file is licensed under the terms of the GNU General Public
9379 + * License version 2. This program is licensed "as is" without any
9380 + * warranty of any kind, whether express or implied.
9381 + */
9383 +#include <linux/module.h>
9384 +#include <linux/init.h>
9385 +#include <linux/ring_buffer.h>
9386 +#include <linux/time.h>
9387 +#include <linux/hrtimer.h>
9388 +#include <linux/kthread.h>
9389 +#include <linux/debugfs.h>
9390 +#include <linux/seq_file.h>
9391 +#include <linux/uaccess.h>
9392 +#include <linux/version.h>
9393 +#include <linux/delay.h>
9394 +#include <linux/slab.h>
9395 +#include <linux/trace_clock.h>
9397 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
9398 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
9399 +#define U64STR_SIZE            22                      /* 20 digits max */
9401 +#define VERSION                        "1.0.0"
9402 +#define BANNER                 "hwlat_detector: "
9403 +#define DRVNAME                        "hwlat_detector"
9404 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
9405 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
9406 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
9408 +/* Module metadata */
9410 +MODULE_LICENSE("GPL");
9411 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
9412 +MODULE_DESCRIPTION("A simple hardware latency detector");
9413 +MODULE_VERSION(VERSION);
9415 +/* Module parameters */
9417 +static int debug;
9418 +static int enabled;
9419 +static int threshold;
9421 +module_param(debug, int, 0);                   /* enable debug */
9422 +module_param(enabled, int, 0);                 /* enable detector */
9423 +module_param(threshold, int, 0);               /* latency threshold */
9425 +/* Buffering and sampling */
9427 +static struct ring_buffer *ring_buffer;                /* sample buffer */
9428 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
9429 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
9430 +static struct task_struct *kthread;            /* sampling thread */
9432 +/* DebugFS filesystem entries */
9434 +static struct dentry *debug_dir;               /* debugfs directory */
9435 +static struct dentry *debug_max;               /* maximum TSC delta */
9436 +static struct dentry *debug_count;             /* total detect count */
9437 +static struct dentry *debug_sample_width;      /* sample width us */
9438 +static struct dentry *debug_sample_window;     /* sample window us */
9439 +static struct dentry *debug_sample;            /* raw samples us */
9440 +static struct dentry *debug_threshold;         /* threshold us */
9441 +static struct dentry *debug_enable;            /* enable/disable */
9443 +/* Individual samples and global state */
9445 +struct sample;                                 /* latency sample */
9446 +struct data;                                   /* Global state */
9448 +/* Sampling functions */
9449 +static int __buffer_add_sample(struct sample *sample);
9450 +static struct sample *buffer_get_sample(struct sample *sample);
9452 +/* Threading and state */
9453 +static int kthread_fn(void *unused);
9454 +static int start_kthread(void);
9455 +static int stop_kthread(void);
9456 +static void __reset_stats(void);
9457 +static int init_stats(void);
9459 +/* Debugfs interface */
9460 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9461 +                               size_t cnt, loff_t *ppos, const u64 *entry);
9462 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9463 +                                size_t cnt, loff_t *ppos, u64 *entry);
9464 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
9465 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
9466 +                                 size_t cnt, loff_t *ppos);
9467 +static int debug_sample_release(struct inode *inode, struct file *filp);
9468 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
9469 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9470 +                                 size_t cnt, loff_t *ppos);
9471 +static ssize_t debug_enable_fwrite(struct file *file,
9472 +                                  const char __user *user_buffer,
9473 +                                  size_t user_size, loff_t *offset);
9475 +/* Initialization functions */
9476 +static int init_debugfs(void);
9477 +static void free_debugfs(void);
9478 +static int detector_init(void);
9479 +static void detector_exit(void);
9481 +/* Individual latency samples are stored here when detected and packed into
9482 + * the ring_buffer circular buffer, where they are overwritten when
9483 + * more than buf_size/sizeof(sample) samples are received. */
9484 +struct sample {
9485 +       u64             seqnum;         /* unique sequence */
9486 +       u64             duration;       /* ktime delta */
9487 +       u64             outer_duration; /* ktime delta (outer loop) */
9488 +       struct timespec timestamp;      /* wall time */
9489 +       unsigned long   lost;
9492 +/* keep the global state somewhere. */
9493 +static struct data {
9495 +       struct mutex lock;              /* protect changes */
9497 +       u64     count;                  /* total since reset */
9498 +       u64     max_sample;             /* max hardware latency */
9499 +       u64     threshold;              /* sample threshold level */
9501 +       u64     sample_window;          /* total sampling window (on+off) */
9502 +       u64     sample_width;           /* active sampling portion of window */
9504 +       atomic_t sample_open;           /* whether the sample file is open */
9506 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
9508 +} data;
9510 +/**
9511 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
9512 + * @sample: The new latency sample value
9513 + *
9514 + * This receives a new latency sample and records it in a global ring buffer.
9515 + * No additional locking is used in this case.
9516 + */
9517 +static int __buffer_add_sample(struct sample *sample)
9519 +       return ring_buffer_write(ring_buffer,
9520 +                                sizeof(struct sample), sample);
9523 +/**
9524 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
9525 + * @sample: Pre-allocated storage for the sample
9526 + *
9527 + * This retrieves a hardware latency sample from the global circular buffer
9528 + */
9529 +static struct sample *buffer_get_sample(struct sample *sample)
9531 +       struct ring_buffer_event *e = NULL;
9532 +       struct sample *s = NULL;
9533 +       unsigned int cpu = 0;
9535 +       if (!sample)
9536 +               return NULL;
9538 +       mutex_lock(&ring_buffer_mutex);
9539 +       for_each_online_cpu(cpu) {
9540 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
9541 +               if (e)
9542 +                       break;
9543 +       }
9545 +       if (e) {
9546 +               s = ring_buffer_event_data(e);
9547 +               memcpy(sample, s, sizeof(struct sample));
9548 +       } else
9549 +               sample = NULL;
9550 +       mutex_unlock(&ring_buffer_mutex);
9552 +       return sample;
9555 +#ifndef CONFIG_TRACING
9556 +#define time_type      ktime_t
9557 +#define time_get()     ktime_get()
9558 +#define time_to_us(x)  ktime_to_us(x)
9559 +#define time_sub(a, b) ktime_sub(a, b)
9560 +#define init_time(a, b)        (a).tv64 = b
9561 +#define time_u64(a)    ((a).tv64)
9562 +#else
9563 +#define time_type      u64
9564 +#define time_get()     trace_clock_local()
9565 +#define time_to_us(x)  div_u64(x, 1000)
9566 +#define time_sub(a, b) ((a) - (b))
9567 +#define init_time(a, b)        (a = b)
9568 +#define time_u64(a)    a
9569 +#endif
9570 +/**
9571 + * get_sample - sample the CPU TSC and look for likely hardware latencies
9572 + *
9573 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
9574 + * hardware-induced latency. Called with interrupts disabled and with
9575 + * data.lock held.
9576 + */
9577 +static int get_sample(void)
9579 +       time_type start, t1, t2, last_t2;
9580 +       s64 diff, total = 0;
9581 +       u64 sample = 0;
9582 +       u64 outer_sample = 0;
9583 +       int ret = -1;
9585 +       init_time(last_t2, 0);
9586 +       start = time_get(); /* start timestamp */
9588 +       do {
9590 +               t1 = time_get();        /* we'll look for a discontinuity */
9591 +               t2 = time_get();
9593 +               if (time_u64(last_t2)) {
9594 +                       /* Check the delta from outer loop (t2 to next t1) */
9595 +                       diff = time_to_us(time_sub(t1, last_t2));
9596 +                       /* This shouldn't happen */
9597 +                       if (diff < 0) {
9598 +                               pr_err(BANNER "time running backwards\n");
9599 +                               goto out;
9600 +                       }
9601 +                       if (diff > outer_sample)
9602 +                               outer_sample = diff;
9603 +               }
9604 +               last_t2 = t2;
9606 +               total = time_to_us(time_sub(t2, start)); /* sample width */
9608 +               /* This checks the inner loop (t1 to t2) */
9609 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
9611 +               /* This shouldn't happen */
9612 +               if (diff < 0) {
9613 +                       pr_err(BANNER "time running backwards\n");
9614 +                       goto out;
9615 +               }
9617 +               if (diff > sample)
9618 +                       sample = diff; /* only want highest value */
9620 +       } while (total <= data.sample_width);
9622 +       ret = 0;
9624 +       /* If we exceed the threshold value, we have found a hardware latency */
9625 +       if (sample > data.threshold || outer_sample > data.threshold) {
9626 +               struct sample s;
9628 +               ret = 1;
9630 +               data.count++;
9631 +               s.seqnum = data.count;
9632 +               s.duration = sample;
9633 +               s.outer_duration = outer_sample;
9634 +               s.timestamp = CURRENT_TIME;
9635 +               __buffer_add_sample(&s);
9637 +               /* Keep a running maximum ever recorded hardware latency */
9638 +               if (sample > data.max_sample)
9639 +                       data.max_sample = sample;
9640 +       }
9642 +out:
9643 +       return ret;
9647 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
9648 + * @unused: A required part of the kthread API.
9649 + *
9650 + * Used to periodically sample the CPU TSC via a call to get_sample. We
9651 + * disable interrupts, which does (intentionally) introduce latency since we
9652 + * need to ensure nothing else might be running (and thus pre-empting).
9653 + * Obviously this should never be used in production environments.
9654 + *
9655 + * Currently this runs on whichever CPU it was scheduled on, but most
9656 + * real-world hardware latency situations occur across several CPUs,
9657 + * so we might later generalize this if we find there are any actual
9658 + * systems with alternate SMI delivery or other hardware latencies.
9659 + */
9660 +static int kthread_fn(void *unused)
9662 +       int ret;
9663 +       u64 interval;
9665 +       while (!kthread_should_stop()) {
9667 +               mutex_lock(&data.lock);
9669 +               local_irq_disable();
9670 +               ret = get_sample();
9671 +               local_irq_enable();
9673 +               if (ret > 0)
9674 +                       wake_up(&data.wq); /* wake up reader(s) */
9676 +               interval = data.sample_window - data.sample_width;
9677 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
9679 +               mutex_unlock(&data.lock);
9681 +               if (msleep_interruptible(interval))
9682 +                       break;
9683 +       }
9685 +       return 0;
9688 +/**
9689 + * start_kthread - Kick off the hardware latency sampling/detector kthread
9690 + *
9691 + * This starts a kernel thread that will sit and sample the CPU timestamp
9692 + * counter (TSC or similar) and look for potential hardware latencies.
9693 + */
9694 +static int start_kthread(void)
9696 +       kthread = kthread_run(kthread_fn, NULL,
9697 +                                       DRVNAME);
9698 +       if (IS_ERR(kthread)) {
9699 +               pr_err(BANNER "could not start sampling thread\n");
9700 +               enabled = 0;
9701 +               return -ENOMEM;
9702 +       }
9704 +       return 0;
9707 +/**
9708 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
9709 + *
9710 + * This kicks the running hardware latency sampling/detector kernel thread and
9711 + * tells it to stop sampling now. Use this on unload and at system shutdown.
9712 + */
9713 +static int stop_kthread(void)
9715 +       int ret;
9717 +       ret = kthread_stop(kthread);
9719 +       return ret;
9722 +/**
9723 + * __reset_stats - Reset statistics for the hardware latency detector
9724 + *
9725 + * We use data to store various statistics and global state. We call this
9726 + * function in order to reset those when "enable" is toggled on or off, and
9727 + * also at initialization. Should be called with data.lock held.
9728 + */
9729 +static void __reset_stats(void)
9731 +       data.count = 0;
9732 +       data.max_sample = 0;
9733 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
9736 +/**
9737 + * init_stats - Setup global state statistics for the hardware latency detector
9738 + *
9739 + * We use data to store various statistics and global state. We also use
9740 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
9741 + * induced system latencies. This function initializes these structures and
9742 + * allocates the global ring buffer also.
9743 + */
9744 +static int init_stats(void)
9746 +       int ret = -ENOMEM;
9748 +       mutex_init(&data.lock);
9749 +       init_waitqueue_head(&data.wq);
9750 +       atomic_set(&data.sample_open, 0);
9752 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
9754 +       if (WARN(!ring_buffer, KERN_ERR BANNER
9755 +                              "failed to allocate ring buffer!\n"))
9756 +               goto out;
9758 +       __reset_stats();
9759 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
9760 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
9761 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
9763 +       ret = 0;
9765 +out:
9766 +       return ret;
9771 + * simple_data_read - Wrapper read function for global state debugfs entries
9772 + * @filp: The active open file structure for the debugfs "file"
9773 + * @ubuf: The userspace provided buffer to read value into
9774 + * @cnt: The maximum number of bytes to read
9775 + * @ppos: The current "file" position
9776 + * @entry: The entry to read from
9777 + *
9778 + * This function provides a generic read implementation for the global state
9779 + * "data" structure debugfs filesystem entries. It would be nice to use
9780 + * simple_attr_read directly, but we need to make sure that the data.lock
9781 + * is held during the actual read.
9782 + */
9783 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
9784 +                               size_t cnt, loff_t *ppos, const u64 *entry)
9786 +       char buf[U64STR_SIZE];
9787 +       u64 val = 0;
9788 +       int len = 0;
9790 +       memset(buf, 0, sizeof(buf));
9792 +       if (!entry)
9793 +               return -EFAULT;
9795 +       mutex_lock(&data.lock);
9796 +       val = *entry;
9797 +       mutex_unlock(&data.lock);
9799 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
9801 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
9806 + * simple_data_write - Wrapper write function for global state debugfs entries
9807 + * @filp: The active open file structure for the debugfs "file"
9808 + * @ubuf: The userspace provided buffer to write value from
9809 + * @cnt: The maximum number of bytes to write
9810 + * @ppos: The current "file" position
9811 + * @entry: The entry to write to
9812 + *
9813 + * This function provides a generic write implementation for the global state
9814 + * "data" structure debugfs filesystem entries. It would be nice to use
9815 + * simple_attr_write directly, but we need to make sure that the data.lock
9816 + * is held during the actual write.
9817 + */
9818 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
9819 +                                size_t cnt, loff_t *ppos, u64 *entry)
9821 +       char buf[U64STR_SIZE];
9822 +       int csize = min(cnt, sizeof(buf));
9823 +       u64 val = 0;
9824 +       int err = 0;
9826 +       memset(buf, '\0', sizeof(buf));
9827 +       if (copy_from_user(buf, ubuf, csize))
9828 +               return -EFAULT;
9830 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
9831 +       err = kstrtoull(buf, 10, &val);
9832 +       if (err)
9833 +               return -EINVAL;
9835 +       mutex_lock(&data.lock);
9836 +       *entry = val;
9837 +       mutex_unlock(&data.lock);
9839 +       return csize;
9842 +/**
9843 + * debug_count_fopen - Open function for "count" debugfs entry
9844 + * @inode: The in-kernel inode representation of the debugfs "file"
9845 + * @filp: The active open file structure for the debugfs "file"
9846 + *
9847 + * This function provides an open implementation for the "count" debugfs
9848 + * interface to the hardware latency detector.
9849 + */
9850 +static int debug_count_fopen(struct inode *inode, struct file *filp)
9852 +       return 0;
9855 +/**
9856 + * debug_count_fread - Read function for "count" debugfs entry
9857 + * @filp: The active open file structure for the debugfs "file"
9858 + * @ubuf: The userspace provided buffer to read value into
9859 + * @cnt: The maximum number of bytes to read
9860 + * @ppos: The current "file" position
9861 + *
9862 + * This function provides a read implementation for the "count" debugfs
9863 + * interface to the hardware latency detector. Can be used to read the
9864 + * number of latency readings exceeding the configured threshold since
9865 + * the detector was last reset (e.g. by writing a zero into "count").
9866 + */
9867 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
9868 +                                    size_t cnt, loff_t *ppos)
9870 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
9873 +/**
9874 + * debug_count_fwrite - Write function for "count" debugfs entry
9875 + * @filp: The active open file structure for the debugfs "file"
9876 + * @ubuf: The user buffer that contains the value to write
9877 + * @cnt: The maximum number of bytes to write to "file"
9878 + * @ppos: The current position in the debugfs "file"
9879 + *
9880 + * This function provides a write implementation for the "count" debugfs
9881 + * interface to the hardware latency detector. Can be used to write a
9882 + * desired value, especially to zero the total count.
9883 + */
9884 +static ssize_t  debug_count_fwrite(struct file *filp,
9885 +                                      const char __user *ubuf,
9886 +                                      size_t cnt,
9887 +                                      loff_t *ppos)
9889 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
9892 +/**
9893 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
9894 + * @inode: The in-kernel inode representation of the debugfs "file"
9895 + * @filp: The active open file structure for the debugfs "file"
9896 + *
9897 + * This function provides an open implementation for the "enable" debugfs
9898 + * interface to the hardware latency detector.
9899 + */
9900 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
9902 +       return 0;
9905 +/**
9906 + * debug_enable_fread - Read function for "enable" debugfs interface
9907 + * @filp: The active open file structure for the debugfs "file"
9908 + * @ubuf: The userspace provided buffer to read value into
9909 + * @cnt: The maximum number of bytes to read
9910 + * @ppos: The current "file" position
9911 + *
9912 + * This function provides a read implementation for the "enable" debugfs
9913 + * interface to the hardware latency detector. Can be used to determine
9914 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
9915 + */
9916 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
9917 +                                     size_t cnt, loff_t *ppos)
9919 +       char buf[4];
9921 +       if ((cnt < sizeof(buf)) || (*ppos))
9922 +               return 0;
9924 +       buf[0] = enabled ? '1' : '0';
9925 +       buf[1] = '\n';
9926 +       buf[2] = '\0';
9927 +       if (copy_to_user(ubuf, buf, strlen(buf)))
9928 +               return -EFAULT;
9929 +       return *ppos = strlen(buf);
9932 +/**
9933 + * debug_enable_fwrite - Write function for "enable" debugfs interface
9934 + * @filp: The active open file structure for the debugfs "file"
9935 + * @ubuf: The user buffer that contains the value to write
9936 + * @cnt: The maximum number of bytes to write to "file"
9937 + * @ppos: The current position in the debugfs "file"
9938 + *
9939 + * This function provides a write implementation for the "enable" debugfs
9940 + * interface to the hardware latency detector. Can be used to enable or
9941 + * disable the detector, which will have the side-effect of possibly
9942 + * also resetting the global stats and kicking off the measuring
9943 + * kthread (on an enable) or the converse (upon a disable).
9944 + */
9945 +static ssize_t  debug_enable_fwrite(struct file *filp,
9946 +                                       const char __user *ubuf,
9947 +                                       size_t cnt,
9948 +                                       loff_t *ppos)
9950 +       char buf[4];
9951 +       int csize = min(cnt, sizeof(buf));
9952 +       long val = 0;
9953 +       int err = 0;
9955 +       memset(buf, '\0', sizeof(buf));
9956 +       if (copy_from_user(buf, ubuf, csize))
9957 +               return -EFAULT;
9959 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
9960 +       err = kstrtoul(buf, 10, &val);
9961 +       if (err)
9962 +               return -EINVAL;
9964 +       if (val) {
9965 +               if (enabled)
9966 +                       goto unlock;
9967 +               enabled = 1;
9968 +               __reset_stats();
9969 +               if (start_kthread())
9970 +                       return -EFAULT;
9971 +       } else {
9972 +               if (!enabled)
9973 +                       goto unlock;
9974 +               enabled = 0;
9975 +               err = stop_kthread();
9976 +               if (err) {
9977 +                       pr_err(BANNER "cannot stop kthread\n");
9978 +                       return -EFAULT;
9979 +               }
9980 +               wake_up(&data.wq);              /* reader(s) should return */
9981 +       }
9982 +unlock:
9983 +       return csize;
9986 +/**
9987 + * debug_max_fopen - Open function for "max" debugfs entry
9988 + * @inode: The in-kernel inode representation of the debugfs "file"
9989 + * @filp: The active open file structure for the debugfs "file"
9990 + *
9991 + * This function provides an open implementation for the "max" debugfs
9992 + * interface to the hardware latency detector.
9993 + */
9994 +static int debug_max_fopen(struct inode *inode, struct file *filp)
9996 +       return 0;
9999 +/**
10000 + * debug_max_fread - Read function for "max" debugfs entry
10001 + * @filp: The active open file structure for the debugfs "file"
10002 + * @ubuf: The userspace provided buffer to read value into
10003 + * @cnt: The maximum number of bytes to read
10004 + * @ppos: The current "file" position
10005 + *
10006 + * This function provides a read implementation for the "max" debugfs
10007 + * interface to the hardware latency detector. Can be used to determine
10008 + * the maximum latency value observed since it was last reset.
10009 + */
10010 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
10011 +                                  size_t cnt, loff_t *ppos)
10013 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
10016 +/**
10017 + * debug_max_fwrite - Write function for "max" debugfs entry
10018 + * @filp: The active open file structure for the debugfs "file"
10019 + * @ubuf: The user buffer that contains the value to write
10020 + * @cnt: The maximum number of bytes to write to "file"
10021 + * @ppos: The current position in the debugfs "file"
10022 + *
10023 + * This function provides a write implementation for the "max" debugfs
10024 + * interface to the hardware latency detector. Can be used to reset the
10025 + * maximum or set it to some other desired value - if, then, subsequent
10026 + * measurements exceed this value, the maximum will be updated.
10027 + */
10028 +static ssize_t  debug_max_fwrite(struct file *filp,
10029 +                                    const char __user *ubuf,
10030 +                                    size_t cnt,
10031 +                                    loff_t *ppos)
10033 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
10037 +/**
10038 + * debug_sample_fopen - An open function for "sample" debugfs interface
10039 + * @inode: The in-kernel inode representation of this debugfs "file"
10040 + * @filp: The active open file structure for the debugfs "file"
10041 + *
10042 + * This function handles opening the "sample" file within the hardware
10043 + * latency detector debugfs directory interface. This file is used to read
10044 + * raw samples from the global ring_buffer and allows the user to see a
10045 + * running latency history. Can be opened blocking or non-blocking,
10046 + * which determines whether reads wait for new samples or return at once.
10047 + * Implements simple locking to prevent multiple simultaneous use.
10048 + */
10049 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
10051 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
10052 +               return -EBUSY;
10053 +       else
10054 +               return 0;
10057 +/**
10058 + * debug_sample_fread - A read function for "sample" debugfs interface
10059 + * @filp: The active open file structure for the debugfs "file"
10060 + * @ubuf: The user buffer that will contain the samples read
10061 + * @cnt: The maximum bytes to read from the debugfs "file"
10062 + * @ppos: The current position in the debugfs "file"
10063 + *
10064 + * This function handles reading from the "sample" file within the hardware
10065 + * latency detector debugfs directory interface. This file is used to read
10066 + * raw samples from the global ring_buffer and allows the user to see a
10067 + * running latency history. By default this will block pending a new
10068 + * value written into the sample buffer, unless one or more values are
10069 + * already waiting in the buffer, or the sample file was
10070 + * previously opened in a non-blocking mode of operation.
10071 + */
10072 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
10073 +                                       size_t cnt, loff_t *ppos)
10075 +       int len = 0;
10076 +       char buf[64];
10077 +       struct sample *sample = NULL;
10079 +       if (!enabled)
10080 +               return 0;
10082 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
10083 +       if (!sample)
10084 +               return -ENOMEM;
10086 +       while (!buffer_get_sample(sample)) {
10088 +               DEFINE_WAIT(wait);
10090 +               if (filp->f_flags & O_NONBLOCK) {
10091 +                       len = -EAGAIN;
10092 +                       goto out;
10093 +               }
10095 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
10096 +               schedule();
10097 +               finish_wait(&data.wq, &wait);
10099 +               if (signal_pending(current)) {
10100 +                       len = -EINTR;
10101 +                       goto out;
10102 +               }
10104 +               if (!enabled) {                 /* enable was toggled */
10105 +                       len = 0;
10106 +                       goto out;
10107 +               }
10108 +       }
10110 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
10111 +                      sample->timestamp.tv_sec,
10112 +                      sample->timestamp.tv_nsec,
10113 +                      sample->duration,
10114 +                      sample->outer_duration);
10117 +       /* handling partial reads is more trouble than it's worth */
10118 +       if (len > cnt)
10119 +               goto out;
10121 +       if (copy_to_user(ubuf, buf, len))
10122 +               len = -EFAULT;
10124 +out:
10125 +       kfree(sample);
10126 +       return len;
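The read path above returns at most one formatted sample per read() call and, without O_NONBLOCK, sleeps until the sampling thread publishes a new one. A minimal user-space sketch of a non-blocking reader; the debugfs mount point /sys/kernel/debug is an assumption, adjust it to wherever debugfs is mounted on your system:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n;
        int fd = open("/sys/kernel/debug/hwlat_detector/sample",
                      O_RDONLY | O_NONBLOCK);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* one read() yields one "<sec>.<nsec>\t<inner>\t<outer>" line,
         * or fails with EAGAIN when no sample is pending */
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);
        } else if (n < 0) {
                perror("read");
        }
        close(fd);
        return n < 0;
}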
10129 +/**
10130 + * debug_sample_release - Release function for "sample" debugfs interface
10131 + * @inode: The in-kernel inode representation of the debugfs "file"
10132 + * @filp: The active open file structure for the debugfs "file"
10133 + *
10134 + * This function completes the close of the debugfs interface "sample" file.
10135 + * Frees the sample_open "lock" so that other users may open the interface.
10136 + */
10137 +static int debug_sample_release(struct inode *inode, struct file *filp)
10139 +       atomic_dec(&data.sample_open);
10141 +       return 0;
10144 +/**
10145 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
10146 + * @inode: The in-kernel inode representation of the debugfs "file"
10147 + * @filp: The active open file structure for the debugfs "file"
10148 + *
10149 + * This function provides an open implementation for the "threshold" debugfs
10150 + * interface to the hardware latency detector.
10151 + */
10152 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
10154 +       return 0;
10157 +/**
10158 + * debug_threshold_fread - Read function for "threshold" debugfs entry
10159 + * @filp: The active open file structure for the debugfs "file"
10160 + * @ubuf: The userspace provided buffer to read value into
10161 + * @cnt: The maximum number of bytes to read
10162 + * @ppos: The current "file" position
10163 + *
10164 + * This function provides a read implementation for the "threshold" debugfs
10165 + * interface to the hardware latency detector. It can be used to determine
10166 + * the current threshold level at which a latency will be recorded in the
10167 + * global ring buffer, typically on the order of 10us.
10168 + */
10169 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
10170 +                                        size_t cnt, loff_t *ppos)
10172 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
10175 +/**
10176 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
10177 + * @filp: The active open file structure for the debugfs "file"
10178 + * @ubuf: The user buffer that contains the value to write
10179 + * @cnt: The maximum number of bytes to write to "file"
10180 + * @ppos: The current position in the debugfs "file"
10181 + *
10182 + * This function provides a write implementation for the "threshold" debugfs
10183 + * interface to the hardware latency detector. It can be used to configure
10184 + * the threshold level at which any subsequently detected latencies will
10185 + * be recorded into the global ring buffer.
10186 + */
10187 +static ssize_t  debug_threshold_fwrite(struct file *filp,
10188 +                                       const char __user *ubuf,
10189 +                                       size_t cnt,
10190 +                                       loff_t *ppos)
10192 +       int ret;
10194 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
10196 +       if (enabled)
10197 +               wake_up_process(kthread);
10199 +       return ret;
10202 +/**
10203 + * debug_width_fopen - Open function for "width" debugfs entry
10204 + * @inode: The in-kernel inode representation of the debugfs "file"
10205 + * @filp: The active open file structure for the debugfs "file"
10206 + *
10207 + * This function provides an open implementation for the "width" debugfs
10208 + * interface to the hardware latency detector.
10209 + */
10210 +static int debug_width_fopen(struct inode *inode, struct file *filp)
10212 +       return 0;
10215 +/**
10216 + * debug_width_fread - Read function for "width" debugfs entry
10217 + * @filp: The active open file structure for the debugfs "file"
10218 + * @ubuf: The userspace provided buffer to read value into
10219 + * @cnt: The maximum number of bytes to read
10220 + * @ppos: The current "file" position
10221 + *
10222 + * This function provides a read implementation for the "width" debugfs
10223 + * interface to the hardware latency detector. It can be used to determine
10224 + * for how many us of the total window us we will actively sample for any
10225 + * hardware-induced latency periods. Obviously, it is not possible to
10226 + * sample constantly and have the system respond to a sample reader, or,
10227 + * worse, without having the system appear to have gone out to lunch.
10228 + */
10229 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
10230 +                                    size_t cnt, loff_t *ppos)
10232 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
10235 +/**
10236 + * debug_width_fwrite - Write function for "width" debugfs entry
10237 + * @filp: The active open file structure for the debugfs "file"
10238 + * @ubuf: The user buffer that contains the value to write
10239 + * @cnt: The maximum number of bytes to write to "file"
10240 + * @ppos: The current position in the debugfs "file"
10241 + *
10242 + * This function provides a write implementation for the "width" debugfs
10243 + * interface to the hardware latency detector. It can be used to configure
10244 + * for how many us of the total window us we will actively sample for any
10245 + * hardware-induced latency periods. Obviously, it is not possible to
10246 + * sample constantly and have the system respond to a sample reader, or,
10247 + * worse, without having the system appear to have gone out to lunch. It
10248 + * is enforced that width is less than the total window size.
10249 + */
10250 +static ssize_t  debug_width_fwrite(struct file *filp,
10251 +                                      const char __user *ubuf,
10252 +                                      size_t cnt,
10253 +                                      loff_t *ppos)
10255 +       char buf[U64STR_SIZE];
10256 +       int csize = min(cnt, sizeof(buf));
10257 +       u64 val = 0;
10258 +       int err = 0;
10260 +       memset(buf, '\0', sizeof(buf));
10261 +       if (copy_from_user(buf, ubuf, csize))
10262 +               return -EFAULT;
10264 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10265 +       err = kstrtoull(buf, 10, &val);
10266 +       if (err)
10267 +               return -EINVAL;
10269 +       mutex_lock(&data.lock);
10270 +       if (val < data.sample_window)
10271 +               data.sample_width = val;
10272 +       else {
10273 +               mutex_unlock(&data.lock);
10274 +               return -EINVAL;
10275 +       }
10276 +       mutex_unlock(&data.lock);
10278 +       if (enabled)
10279 +               wake_up_process(kthread);
10281 +       return csize;
10284 +/**
10285 + * debug_window_fopen - Open function for "window" debugfs entry
10286 + * @inode: The in-kernel inode representation of the debugfs "file"
10287 + * @filp: The active open file structure for the debugfs "file"
10288 + *
10289 + * This function provides an open implementation for the "window" debugfs
10290 + * interface to the hardware latency detector. The window is the total time
10291 + * in us that will be considered one sample period. Conceptually, windows
10292 + * occur back-to-back and contain a sample width period during which
10293 + * actual sampling occurs.
10294 + */
10295 +static int debug_window_fopen(struct inode *inode, struct file *filp)
10297 +       return 0;
10300 +/**
10301 + * debug_window_fread - Read function for "window" debugfs entry
10302 + * @filp: The active open file structure for the debugfs "file"
10303 + * @ubuf: The userspace provided buffer to read value into
10304 + * @cnt: The maximum number of bytes to read
10305 + * @ppos: The current "file" position
10306 + *
10307 + * This function provides a read implementation for the "window" debugfs
10308 + * interface to the hardware latency detector. The window is the total time
10309 + * in us that will be considered one sample period. Conceptually, windows
10310 + * occur back-to-back and contain a sample width period during which
10311 + * actual sampling occurs. Can be used to read the total window size.
10312 + */
10313 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
10314 +                                     size_t cnt, loff_t *ppos)
10316 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
10319 +/**
10320 + * debug_window_fwrite - Write function for "window" debugfs entry
10321 + * @filp: The active open file structure for the debugfs "file"
10322 + * @ubuf: The user buffer that contains the value to write
10323 + * @cnt: The maximum number of bytes to write to "file"
10324 + * @ppos: The current position in the debugfs "file"
10325 + *
10326 + * This function provides a write implementation for the "window" debugfs
10327 + * interface to the hardware latency detector. The window is the total time
10328 + * in us that will be considered one sample period. Conceptually, windows
10329 + * occur back-to-back and contain a sample width period during which
10330 + * actual sampling occurs. Can be used to write a new total window size. It
10331 + * is enforced that any value written must be greater than the sample width
10332 + * size, or an error results.
10333 + */
10334 +static ssize_t  debug_window_fwrite(struct file *filp,
10335 +                                       const char __user *ubuf,
10336 +                                       size_t cnt,
10337 +                                       loff_t *ppos)
10339 +       char buf[U64STR_SIZE];
10340 +       int csize = min(cnt, sizeof(buf));
10341 +       u64 val = 0;
10342 +       int err = 0;
10344 +       memset(buf, '\0', sizeof(buf));
10345 +       if (copy_from_user(buf, ubuf, csize))
10346 +               return -EFAULT;
10348 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
10349 +       err = kstrtoull(buf, 10, &val);
10350 +       if (err)
10351 +               return -EINVAL;
10353 +       mutex_lock(&data.lock);
10354 +       if (data.sample_width < val)
10355 +               data.sample_window = val;
10356 +       else {
10357 +               mutex_unlock(&data.lock);
10358 +               return -EINVAL;
10359 +       }
10360 +       mutex_unlock(&data.lock);
10362 +       return csize;
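Both write handlers above validate the new value against its counterpart under data.lock, so a user-space tool has to order its updates to keep width strictly below window at every step (grow the window first, shrink the width first). A hedged sketch of such a helper; the debugfs path and the example values are assumptions, only the entry names come from the code above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* write one decimal value to an hwlat_detector debugfs entry */
static int hwlat_write(const char *entry, unsigned long long val)
{
        char path[128], buf[32];
        int fd, len, ret = 0;

        snprintf(path, sizeof(path),
                 "/sys/kernel/debug/hwlat_detector/%s", entry);
        len = snprintf(buf, sizeof(buf), "%llu\n", val);

        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        if (write(fd, buf, len) != len)
                ret = -1;
        close(fd);
        return ret;
}

int main(void)
{
        /* grow the window before the width so width < window always holds */
        if (hwlat_write("window", 2000000) || hwlat_write("width", 1000000))
                perror("hwlat_write");
        return 0;
}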
10366 + * Function pointers for the "count" debugfs file operations
10367 + */
10368 +static const struct file_operations count_fops = {
10369 +       .open           = debug_count_fopen,
10370 +       .read           = debug_count_fread,
10371 +       .write          = debug_count_fwrite,
10372 +       .owner          = THIS_MODULE,
10376 + * Function pointers for the "enable" debugfs file operations
10377 + */
10378 +static const struct file_operations enable_fops = {
10379 +       .open           = debug_enable_fopen,
10380 +       .read           = debug_enable_fread,
10381 +       .write          = debug_enable_fwrite,
10382 +       .owner          = THIS_MODULE,
10386 + * Function pointers for the "max" debugfs file operations
10387 + */
10388 +static const struct file_operations max_fops = {
10389 +       .open           = debug_max_fopen,
10390 +       .read           = debug_max_fread,
10391 +       .write          = debug_max_fwrite,
10392 +       .owner          = THIS_MODULE,
10396 + * Function pointers for the "sample" debugfs file operations
10397 + */
10398 +static const struct file_operations sample_fops = {
10399 +       .open           = debug_sample_fopen,
10400 +       .read           = debug_sample_fread,
10401 +       .release        = debug_sample_release,
10402 +       .owner          = THIS_MODULE,
10406 + * Function pointers for the "threshold" debugfs file operations
10407 + */
10408 +static const struct file_operations threshold_fops = {
10409 +       .open           = debug_threshold_fopen,
10410 +       .read           = debug_threshold_fread,
10411 +       .write          = debug_threshold_fwrite,
10412 +       .owner          = THIS_MODULE,
10416 + * Function pointers for the "width" debugfs file operations
10417 + */
10418 +static const struct file_operations width_fops = {
10419 +       .open           = debug_width_fopen,
10420 +       .read           = debug_width_fread,
10421 +       .write          = debug_width_fwrite,
10422 +       .owner          = THIS_MODULE,
10426 + * Function pointers for the "window" debugfs file operations
10427 + */
10428 +static const struct file_operations window_fops = {
10429 +       .open           = debug_window_fopen,
10430 +       .read           = debug_window_fread,
10431 +       .write          = debug_window_fwrite,
10432 +       .owner          = THIS_MODULE,
10435 +/**
10436 + * init_debugfs - A function to initialize the debugfs interface files
10437 + *
10438 + * This function creates entries in debugfs for "hwlat_detector", including
10439 + * files to read values from the detector, current samples, and the
10440 + * maximum sample that has been captured since the hardware latency
10441 + * detector was started.
10442 + */
10443 +static int init_debugfs(void)
10445 +       int ret = -ENOMEM;
10447 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
10448 +       if (!debug_dir)
10449 +               goto err_debug_dir;
10451 +       debug_sample = debugfs_create_file("sample", 0444,
10452 +                                              debug_dir, NULL,
10453 +                                              &sample_fops);
10454 +       if (!debug_sample)
10455 +               goto err_sample;
10457 +       debug_count = debugfs_create_file("count", 0444,
10458 +                                             debug_dir, NULL,
10459 +                                             &count_fops);
10460 +       if (!debug_count)
10461 +               goto err_count;
10463 +       debug_max = debugfs_create_file("max", 0444,
10464 +                                           debug_dir, NULL,
10465 +                                           &max_fops);
10466 +       if (!debug_max)
10467 +               goto err_max;
10469 +       debug_sample_window = debugfs_create_file("window", 0644,
10470 +                                                     debug_dir, NULL,
10471 +                                                     &window_fops);
10472 +       if (!debug_sample_window)
10473 +               goto err_window;
10475 +       debug_sample_width = debugfs_create_file("width", 0644,
10476 +                                                    debug_dir, NULL,
10477 +                                                    &width_fops);
10478 +       if (!debug_sample_width)
10479 +               goto err_width;
10481 +       debug_threshold = debugfs_create_file("threshold", 0644,
10482 +                                                 debug_dir, NULL,
10483 +                                                 &threshold_fops);
10484 +       if (!debug_threshold)
10485 +               goto err_threshold;
10487 +       debug_enable = debugfs_create_file("enable", 0644,
10488 +                                              debug_dir, &enabled,
10489 +                                              &enable_fops);
10490 +       if (!debug_enable)
10491 +               goto err_enable;
10493 +       else {
10494 +               ret = 0;
10495 +               goto out;
10496 +       }
10498 +err_enable:
10499 +       debugfs_remove(debug_threshold);
10500 +err_threshold:
10501 +       debugfs_remove(debug_sample_width);
10502 +err_width:
10503 +       debugfs_remove(debug_sample_window);
10504 +err_window:
10505 +       debugfs_remove(debug_max);
10506 +err_max:
10507 +       debugfs_remove(debug_count);
10508 +err_count:
10509 +       debugfs_remove(debug_sample);
10510 +err_sample:
10511 +       debugfs_remove(debug_dir);
10512 +err_debug_dir:
10513 +out:
10514 +       return ret;
10517 +/**
10518 + * free_debugfs - A function to cleanup the debugfs file interface
10519 + */
10520 +static void free_debugfs(void)
10522 +       /* could also use a debugfs_remove_recursive */
10523 +       debugfs_remove(debug_enable);
10524 +       debugfs_remove(debug_threshold);
10525 +       debugfs_remove(debug_sample_width);
10526 +       debugfs_remove(debug_sample_window);
10527 +       debugfs_remove(debug_max);
10528 +       debugfs_remove(debug_count);
10529 +       debugfs_remove(debug_sample);
10530 +       debugfs_remove(debug_dir);
10533 +/**
10534 + * detector_init - Standard module initialization code
10535 + */
10536 +static int detector_init(void)
10538 +       int ret = -ENOMEM;
10540 +       pr_info(BANNER "version %s\n", VERSION);
10542 +       ret = init_stats();
10543 +       if (ret)
10544 +               goto out;
10546 +       ret = init_debugfs();
10547 +       if (ret)
10548 +               goto err_stats;
10550 +       if (enabled)
10551 +               ret = start_kthread();
10553 +       goto out;
10555 +err_stats:
10556 +       ring_buffer_free(ring_buffer);
10557 +out:
10558 +       return ret;
10562 +/**
10563 + * detector_exit - Standard module cleanup code
10564 + */
10565 +static void detector_exit(void)
10567 +       int err;
10569 +       if (enabled) {
10570 +               enabled = 0;
10571 +               err = stop_kthread();
10572 +               if (err)
10573 +                       pr_err(BANNER "cannot stop kthread\n");
10574 +       }
10576 +       free_debugfs();
10577 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
10581 +module_init(detector_init);
10582 +module_exit(detector_exit);
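init_debugfs() and free_debugfs() above bracket the module's debugfs lifetime: a goto-unwind chain removes every file created so far when a later creation fails, and the exit path removes them one by one (the comment notes debugfs_remove_recursive() as an alternative). A minimal, hypothetical module sketch of that lifecycle, not tied to the detector itself:

#include <linux/module.h>
#include <linux/debugfs.h>

static struct dentry *demo_dir;

static int __init demo_init(void)
{
        demo_dir = debugfs_create_dir("demo_detector", NULL);
        if (!demo_dir)
                return -ENOMEM;
        return 0;
}

static void __exit demo_exit(void)
{
        /* one call tears down the whole directory tree */
        debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");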
10583 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
10584 index acece3299756..58ea04a03fa9 100644
10585 --- a/drivers/mmc/host/mmci.c
10586 +++ b/drivers/mmc/host/mmci.c
10587 @@ -1155,15 +1155,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10588         struct sg_mapping_iter *sg_miter = &host->sg_miter;
10589         struct variant_data *variant = host->variant;
10590         void __iomem *base = host->base;
10591 -       unsigned long flags;
10592         u32 status;
10594         status = readl(base + MMCISTATUS);
10596         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
10598 -       local_irq_save(flags);
10600         do {
10601                 unsigned int remain, len;
10602                 char *buffer;
10603 @@ -1203,8 +1200,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
10605         sg_miter_stop(sg_miter);
10607 -       local_irq_restore(flags);
10609         /*
10610          * If we have less than the fifo 'half-full' threshold to transfer,
10611          * trigger a PIO interrupt as soon as any data is available.
10612 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
10613 index 2839af00f20c..4348b9c850d3 100644
10614 --- a/drivers/net/ethernet/3com/3c59x.c
10615 +++ b/drivers/net/ethernet/3com/3c59x.c
10616 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
10618         struct vortex_private *vp = netdev_priv(dev);
10619         unsigned long flags;
10620 -       local_irq_save(flags);
10621 +       local_irq_save_nort(flags);
10622         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
10623 -       local_irq_restore(flags);
10624 +       local_irq_restore_nort(flags);
10626  #endif
10628 @@ -1916,12 +1916,12 @@ static void vortex_tx_timeout(struct net_device *dev)
10629                          * Block interrupts because vortex_interrupt does a bare spin_lock()
10630                          */
10631                         unsigned long flags;
10632 -                       local_irq_save(flags);
10633 +                       local_irq_save_nort(flags);
10634                         if (vp->full_bus_master_tx)
10635                                 boomerang_interrupt(dev->irq, dev);
10636                         else
10637                                 vortex_interrupt(dev->irq, dev);
10638 -                       local_irq_restore(flags);
10639 +                       local_irq_restore_nort(flags);
10640                 }
10641         }
10643 diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10644 index 8b5988e210d5..cf9928ccdd7e 100644
10645 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10646 +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
10647 @@ -2221,11 +2221,7 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb,
10648         }
10650         tpd_req = atl1c_cal_tpd_req(skb);
10651 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
10652 -               if (netif_msg_pktdata(adapter))
10653 -                       dev_info(&adapter->pdev->dev, "tx locked\n");
10654 -               return NETDEV_TX_LOCKED;
10655 -       }
10656 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10658         if (atl1c_tpd_avail(adapter, type) < tpd_req) {
10659                 /* no enough descriptor, just stop queue */
10660 diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10661 index 59a03a193e83..734f7a7ad2c3 100644
10662 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10663 +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
10664 @@ -1880,8 +1880,7 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
10665                 return NETDEV_TX_OK;
10666         }
10667         tpd_req = atl1e_cal_tdp_req(skb);
10668 -       if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
10669 -               return NETDEV_TX_LOCKED;
10670 +       spin_lock_irqsave(&adapter->tx_lock, flags);
10672         if (atl1e_tpd_avail(adapter) < tpd_req) {
10673                 /* no enough descriptor, just stop queue */
10674 diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
10675 index 526ea74e82d9..86f467a2c485 100644
10676 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c
10677 +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
10678 @@ -1664,8 +1664,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter,
10679         struct cmdQ *q = &sge->cmdQ[qid];
10680         unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
10682 -       if (!spin_trylock(&q->lock))
10683 -               return NETDEV_TX_LOCKED;
10684 +       spin_lock(&q->lock);
10686         reclaim_completed_tx(sge, q);
10688 diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
10689 index 9ba975853ec6..813cfa698160 100644
10690 --- a/drivers/net/ethernet/neterion/s2io.c
10691 +++ b/drivers/net/ethernet/neterion/s2io.c
10692 @@ -4084,12 +4084,7 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev)
10693                         [skb->priority & (MAX_TX_FIFOS - 1)];
10694         fifo = &mac_control->fifos[queue];
10696 -       if (do_spin_lock)
10697 -               spin_lock_irqsave(&fifo->tx_lock, flags);
10698 -       else {
10699 -               if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
10700 -                       return NETDEV_TX_LOCKED;
10701 -       }
10702 +       spin_lock_irqsave(&fifo->tx_lock, flags);
10704         if (sp->config.multiq) {
10705                 if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
10706 diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10707 index 3b98b263bad0..ca4add749410 100644
10708 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10709 +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
10710 @@ -2137,10 +2137,8 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
10711         struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
10712         unsigned long flags;
10714 -       if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
10715 -               /* Collision - tell upper layer to requeue */
10716 -               return NETDEV_TX_LOCKED;
10717 -       }
10718 +       spin_lock_irqsave(&tx_ring->tx_lock, flags);
10720         if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
10721                 netif_stop_queue(netdev);
10722                 spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
10723 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
10724 index ef668d300800..d987d571fdd6 100644
10725 --- a/drivers/net/ethernet/realtek/8139too.c
10726 +++ b/drivers/net/ethernet/realtek/8139too.c
10727 @@ -2229,7 +2229,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
10728         struct rtl8139_private *tp = netdev_priv(dev);
10729         const int irq = tp->pci_dev->irq;
10731 -       disable_irq(irq);
10732 +       disable_irq_nosync(irq);
10733         rtl8139_interrupt(irq, dev);
10734         enable_irq(irq);
10736 diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c
10737 index 14c9d1baa85c..e1a5305418a8 100644
10738 --- a/drivers/net/ethernet/tehuti/tehuti.c
10739 +++ b/drivers/net/ethernet/tehuti/tehuti.c
10740 @@ -1629,13 +1629,8 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb,
10741         unsigned long flags;
10743         ENTER;
10744 -       local_irq_save(flags);
10745 -       if (!spin_trylock(&priv->tx_lock)) {
10746 -               local_irq_restore(flags);
10747 -               DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
10748 -                   BDX_DRV_NAME, ndev->name);
10749 -               return NETDEV_TX_LOCKED;
10750 -       }
10752 +       spin_lock_irqsave(&priv->tx_lock, flags);
10754         /* build tx descriptor */
10755         BDX_ASSERT(f->m.wptr >= f->m.memsz);    /* started with valid wptr */
10756 diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
10757 index e7034c55e796..2e4ee0f912bf 100644
10758 --- a/drivers/net/rionet.c
10759 +++ b/drivers/net/rionet.c
10760 @@ -174,11 +174,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
10761         unsigned long flags;
10762         int add_num = 1;
10764 -       local_irq_save(flags);
10765 -       if (!spin_trylock(&rnet->tx_lock)) {
10766 -               local_irq_restore(flags);
10767 -               return NETDEV_TX_LOCKED;
10768 -       }
10769 +       spin_lock_irqsave(&rnet->tx_lock, flags);
10771         if (is_multicast_ether_addr(eth->h_dest))
10772                 add_num = nets[rnet->mport->id].nact;
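The network-driver hunks above (atl1c, atl1e, cxgb, s2io, pch_gbe, tehuti, rionet) all make the same change: the trylock-and-bail path that returned NETDEV_TX_LOCKED is replaced by an unconditional lock, which is what RT wants once the lock may sleep and is in line with the later upstream removal of NETDEV_TX_LOCKED. A hedged, hypothetical sketch of the shape of the change; none of the names below come from the drivers patched above:

#include <linux/netdevice.h>
#include <linux/spinlock.h>

struct demo_adapter {
        spinlock_t tx_lock;
};

static netdev_tx_t demo_xmit_frame(struct sk_buff *skb, struct net_device *dev)
{
        struct demo_adapter *adapter = netdev_priv(dev);
        unsigned long flags;

        /*
         * Before:
         *      if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
         *              return NETDEV_TX_LOCKED;
         * After: take the lock unconditionally and wait if contended.
         */
        spin_lock_irqsave(&adapter->tx_lock, flags);

        /* ... set up descriptors and ring the doorbell ... */

        spin_unlock_irqrestore(&adapter->tx_lock, flags);
        return NETDEV_TX_OK;
}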
10773 diff --git a/drivers/net/wireless/orinoco/orinoco_usb.c b/drivers/net/wireless/orinoco/orinoco_usb.c
10774 index f2cd513d54b2..6c0f4c9638a2 100644
10775 --- a/drivers/net/wireless/orinoco/orinoco_usb.c
10776 +++ b/drivers/net/wireless/orinoco/orinoco_usb.c
10777 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
10778                         while (!ctx->done.done && msecs--)
10779                                 udelay(1000);
10780                 } else {
10781 -                       wait_event_interruptible(ctx->done.wait,
10782 +                       swait_event_interruptible(ctx->done.wait,
10783                                                  ctx->done.done);
10784                 }
10785                 break;
10786 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
10787 index 59ac36fe7c42..7a45a20af78a 100644
10788 --- a/drivers/pci/access.c
10789 +++ b/drivers/pci/access.c
10790 @@ -561,7 +561,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
10791         WARN_ON(!dev->block_cfg_access);
10793         dev->block_cfg_access = 0;
10794 -       wake_up_all(&pci_cfg_wait);
10795 +       wake_up_all_locked(&pci_cfg_wait);
10796         raw_spin_unlock_irqrestore(&pci_lock, flags);
10798  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
10799 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
10800 index 9736f9be5447..5fe9b173dcb3 100644
10801 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
10802 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
10803 @@ -60,7 +60,7 @@ struct msm_pinctrl {
10804         struct notifier_block restart_nb;
10805         int irq;
10807 -       spinlock_t lock;
10808 +       raw_spinlock_t lock;
10810         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
10811         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
10812 @@ -156,14 +156,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
10813         if (WARN_ON(i == g->nfuncs))
10814                 return -EINVAL;
10816 -       spin_lock_irqsave(&pctrl->lock, flags);
10817 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10819         val = readl(pctrl->regs + g->ctl_reg);
10820         val &= ~(0x7 << g->mux_bit);
10821         val |= i << g->mux_bit;
10822         writel(val, pctrl->regs + g->ctl_reg);
10824 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10825 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10827         return 0;
10829 @@ -326,14 +326,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
10830                         break;
10831                 case PIN_CONFIG_OUTPUT:
10832                         /* set output value */
10833 -                       spin_lock_irqsave(&pctrl->lock, flags);
10834 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
10835                         val = readl(pctrl->regs + g->io_reg);
10836                         if (arg)
10837                                 val |= BIT(g->out_bit);
10838                         else
10839                                 val &= ~BIT(g->out_bit);
10840                         writel(val, pctrl->regs + g->io_reg);
10841 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
10842 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10844                         /* enable output */
10845                         arg = 1;
10846 @@ -354,12 +354,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
10847                         return -EINVAL;
10848                 }
10850 -               spin_lock_irqsave(&pctrl->lock, flags);
10851 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
10852                 val = readl(pctrl->regs + g->ctl_reg);
10853                 val &= ~(mask << bit);
10854                 val |= arg << bit;
10855                 writel(val, pctrl->regs + g->ctl_reg);
10856 -               spin_unlock_irqrestore(&pctrl->lock, flags);
10857 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10858         }
10860         return 0;
10861 @@ -387,13 +387,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
10863         g = &pctrl->soc->groups[offset];
10865 -       spin_lock_irqsave(&pctrl->lock, flags);
10866 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10868         val = readl(pctrl->regs + g->ctl_reg);
10869         val &= ~BIT(g->oe_bit);
10870         writel(val, pctrl->regs + g->ctl_reg);
10872 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10873 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10875         return 0;
10877 @@ -407,7 +407,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
10879         g = &pctrl->soc->groups[offset];
10881 -       spin_lock_irqsave(&pctrl->lock, flags);
10882 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10884         val = readl(pctrl->regs + g->io_reg);
10885         if (value)
10886 @@ -420,7 +420,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
10887         val |= BIT(g->oe_bit);
10888         writel(val, pctrl->regs + g->ctl_reg);
10890 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10891 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10893         return 0;
10895 @@ -446,7 +446,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
10897         g = &pctrl->soc->groups[offset];
10899 -       spin_lock_irqsave(&pctrl->lock, flags);
10900 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10902         val = readl(pctrl->regs + g->io_reg);
10903         if (value)
10904 @@ -455,7 +455,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
10905                 val &= ~BIT(g->out_bit);
10906         writel(val, pctrl->regs + g->io_reg);
10908 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10909 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10912  #ifdef CONFIG_DEBUG_FS
10913 @@ -574,7 +574,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
10915         g = &pctrl->soc->groups[d->hwirq];
10917 -       spin_lock_irqsave(&pctrl->lock, flags);
10918 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10920         val = readl(pctrl->regs + g->intr_cfg_reg);
10921         val &= ~BIT(g->intr_enable_bit);
10922 @@ -582,7 +582,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
10924         clear_bit(d->hwirq, pctrl->enabled_irqs);
10926 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10927 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10930  static void msm_gpio_irq_unmask(struct irq_data *d)
10931 @@ -595,7 +595,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
10933         g = &pctrl->soc->groups[d->hwirq];
10935 -       spin_lock_irqsave(&pctrl->lock, flags);
10936 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10938         val = readl(pctrl->regs + g->intr_cfg_reg);
10939         val |= BIT(g->intr_enable_bit);
10940 @@ -603,7 +603,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
10942         set_bit(d->hwirq, pctrl->enabled_irqs);
10944 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10945 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10948  static void msm_gpio_irq_ack(struct irq_data *d)
10949 @@ -616,7 +616,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
10951         g = &pctrl->soc->groups[d->hwirq];
10953 -       spin_lock_irqsave(&pctrl->lock, flags);
10954 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10956         val = readl(pctrl->regs + g->intr_status_reg);
10957         if (g->intr_ack_high)
10958 @@ -628,7 +628,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
10959         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
10960                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
10962 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10963 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10966  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
10967 @@ -641,7 +641,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
10969         g = &pctrl->soc->groups[d->hwirq];
10971 -       spin_lock_irqsave(&pctrl->lock, flags);
10972 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10974         /*
10975          * For hw without possibility of detecting both edges
10976 @@ -715,7 +715,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
10977         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
10978                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
10980 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10981 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10983         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
10984                 irq_set_handler_locked(d, handle_level_irq);
10985 @@ -731,11 +731,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
10986         struct msm_pinctrl *pctrl = to_msm_pinctrl(gc);
10987         unsigned long flags;
10989 -       spin_lock_irqsave(&pctrl->lock, flags);
10990 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
10992         irq_set_irq_wake(pctrl->irq, on);
10994 -       spin_unlock_irqrestore(&pctrl->lock, flags);
10995 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
10997         return 0;
10999 @@ -881,7 +881,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
11000         pctrl->soc = soc_data;
11001         pctrl->chip = msm_gpio_template;
11003 -       spin_lock_init(&pctrl->lock);
11004 +       raw_spin_lock_init(&pctrl->lock);
11006         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
11007         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
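The pinctrl-msm conversion above is mechanical but deliberate: the lock is taken from irqchip callbacks (mask, unmask, ack, set_type) that run nested inside the irq descriptor's raw lock, so under RT it must remain a truly spinning raw_spinlock_t instead of a sleeping spinlock_t. A hedged sketch of the same shape in a hypothetical register-banging driver:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/spinlock.h>

struct demo_ctrl {
        raw_spinlock_t lock;            /* was: spinlock_t lock; */
        void __iomem *regs;
};

static void demo_update_bit(struct demo_ctrl *ctrl, unsigned int reg,
                            unsigned int bit, bool set)
{
        unsigned long flags;
        u32 val;

        raw_spin_lock_irqsave(&ctrl->lock, flags);
        val = readl(ctrl->regs + reg);
        if (set)
                val |= BIT(bit);
        else
                val &= ~BIT(bit);
        writel(val, ctrl->regs + reg);
        raw_spin_unlock_irqrestore(&ctrl->lock, flags);
}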
11008 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
11009 index f4424063b860..cbbbebd86c6e 100644
11010 --- a/drivers/scsi/fcoe/fcoe.c
11011 +++ b/drivers/scsi/fcoe/fcoe.c
11012 @@ -1286,7 +1286,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
11013         struct sk_buff *skb;
11014  #ifdef CONFIG_SMP
11015         struct fcoe_percpu_s *p0;
11016 -       unsigned targ_cpu = get_cpu();
11017 +       unsigned targ_cpu = get_cpu_light();
11018  #endif /* CONFIG_SMP */
11020         FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
11021 @@ -1342,7 +1342,7 @@ static void fcoe_percpu_thread_destroy(unsigned int cpu)
11022                         kfree_skb(skb);
11023                 spin_unlock_bh(&p->fcoe_rx_list.lock);
11024         }
11025 -       put_cpu();
11026 +       put_cpu_light();
11027  #else
11028         /*
11029          * This a non-SMP scenario where the singular Rx thread is
11030 @@ -1566,11 +1566,11 @@ err2:
11031  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
11033         struct fcoe_percpu_s *fps;
11034 -       int rc;
11035 +       int rc, cpu = get_cpu_light();
11037 -       fps = &get_cpu_var(fcoe_percpu);
11038 +       fps = &per_cpu(fcoe_percpu, cpu);
11039         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
11040 -       put_cpu_var(fcoe_percpu);
11041 +       put_cpu_light();
11043         return rc;
11045 @@ -1766,11 +1766,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
11046                 return 0;
11047         }
11049 -       stats = per_cpu_ptr(lport->stats, get_cpu());
11050 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
11051         stats->InvalidCRCCount++;
11052         if (stats->InvalidCRCCount < 5)
11053                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
11054 -       put_cpu();
11055 +       put_cpu_light();
11056         return -EINVAL;
11059 @@ -1814,7 +1814,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
11060          */
11061         hp = (struct fcoe_hdr *) skb_network_header(skb);
11063 -       stats = per_cpu_ptr(lport->stats, get_cpu());
11064 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
11065         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
11066                 if (stats->ErrorFrames < 5)
11067                         printk(KERN_WARNING "fcoe: FCoE version "
11068 @@ -1846,13 +1846,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
11069                 goto drop;
11071         if (!fcoe_filter_frames(lport, fp)) {
11072 -               put_cpu();
11073 +               put_cpu_light();
11074                 fc_exch_recv(lport, fp);
11075                 return;
11076         }
11077  drop:
11078         stats->ErrorFrames++;
11079 -       put_cpu();
11080 +       put_cpu_light();
11081         kfree_skb(skb);
11084 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
11085 index 34a1b1f333b4..d91131210695 100644
11086 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
11087 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
11088 @@ -831,7 +831,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
11090         INIT_LIST_HEAD(&del_list);
11092 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
11093 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
11095         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
11096                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
11097 @@ -867,7 +867,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
11098                                 sel_time = fcf->time;
11099                 }
11100         }
11101 -       put_cpu();
11102 +       put_cpu_light();
11104         list_for_each_entry_safe(fcf, next, &del_list, list) {
11105                 /* Removes fcf from current list */
11106 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
11107 index 30f9ef0c0d4f..6c686bc01a82 100644
11108 --- a/drivers/scsi/libfc/fc_exch.c
11109 +++ b/drivers/scsi/libfc/fc_exch.c
11110 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
11111         }
11112         memset(ep, 0, sizeof(*ep));
11114 -       cpu = get_cpu();
11115 +       cpu = get_cpu_light();
11116         pool = per_cpu_ptr(mp->pool, cpu);
11117         spin_lock_bh(&pool->lock);
11118 -       put_cpu();
11119 +       put_cpu_light();
11121         /* peek cache of free slot */
11122         if (pool->left != FC_XID_UNKNOWN) {
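The fcoe, fcoe_ctlr and fc_exch hunks above replace get_cpu()/put_cpu() (and get_cpu_var()/put_cpu_var()) with the RT-provided get_cpu_light()/put_cpu_light(), so the per-CPU statistics update disables migration rather than preemption on RT. A hedged, hypothetical sketch of the pattern on a standalone per-CPU counter; the _light helpers are added elsewhere in this patch:

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, demo_err_count);

static void demo_count_error(void)
{
        int cpu = get_cpu_light();      /* stays preemptible, but pinned to this CPU on RT */

        per_cpu(demo_err_count, cpu)++;
        put_cpu_light();
}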
11123 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
11124 index 6f5e2720ffad..ee8a8ed49b89 100644
11125 --- a/drivers/scsi/libsas/sas_ata.c
11126 +++ b/drivers/scsi/libsas/sas_ata.c
11127 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
11128         /* TODO: audit callers to ensure they are ready for qc_issue to
11129          * unconditionally re-enable interrupts
11130          */
11131 -       local_irq_save(flags);
11132 +       local_irq_save_nort(flags);
11133         spin_unlock(ap->lock);
11135         /* If the device fell off, no sense in issuing commands */
11136 @@ -255,7 +255,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
11138   out:
11139         spin_lock(ap->lock);
11140 -       local_irq_restore(flags);
11141 +       local_irq_restore_nort(flags);
11142         return ret;
11145 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
11146 index fee9eb7c8a60..b42d4adc42dc 100644
11147 --- a/drivers/scsi/qla2xxx/qla_inline.h
11148 +++ b/drivers/scsi/qla2xxx/qla_inline.h
11149 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
11151         unsigned long flags;
11152         struct qla_hw_data *ha = rsp->hw;
11153 -       local_irq_save(flags);
11154 +       local_irq_save_nort(flags);
11155         if (IS_P3P_TYPE(ha))
11156                 qla82xx_poll(0, rsp);
11157         else
11158                 ha->isp_ops->intr_handler(0, rsp);
11159 -       local_irq_restore(flags);
11160 +       local_irq_restore_nort(flags);
11163  static inline uint8_t *
11164 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
11165 index 7fc919f7da4d..e03fa17b8670 100644
11166 --- a/drivers/thermal/x86_pkg_temp_thermal.c
11167 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
11168 @@ -29,6 +29,7 @@
11169  #include <linux/pm.h>
11170  #include <linux/thermal.h>
11171  #include <linux/debugfs.h>
11172 +#include <linux/swork.h>
11173  #include <asm/cpu_device_id.h>
11174  #include <asm/mce.h>
11176 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
11177         }
11180 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11181 +static void platform_thermal_notify_work(struct swork_event *event)
11183         unsigned long flags;
11184         int cpu = smp_processor_id();
11185 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11186                         pkg_work_scheduled[phy_id]) {
11187                 disable_pkg_thres_interrupt();
11188                 spin_unlock_irqrestore(&pkg_work_lock, flags);
11189 -               return -EINVAL;
11190 +               return;
11191         }
11192         pkg_work_scheduled[phy_id] = 1;
11193         spin_unlock_irqrestore(&pkg_work_lock, flags);
11194 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11195         schedule_delayed_work_on(cpu,
11196                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
11197                                 msecs_to_jiffies(notify_delay_ms));
11200 +#ifdef CONFIG_PREEMPT_RT_FULL
11201 +static struct swork_event notify_work;
11203 +static int thermal_notify_work_init(void)
11205 +       int err;
11207 +       err = swork_get();
11208 +       if (err)
11209 +               return err;
11211 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
11212         return 0;
11215 +static void thermal_notify_work_cleanup(void)
11217 +       swork_put();
11220 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11222 +       swork_queue(&notify_work);
11223 +       return 0;
11226 +#else  /* !CONFIG_PREEMPT_RT_FULL */
11228 +static int thermal_notify_work_init(void) { return 0; }
11230 +static void thermal_notify_work_cleanup(void) {  }
11232 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
11234 +       platform_thermal_notify_work(NULL);
11236 +       return 0;
11238 +#endif /* CONFIG_PREEMPT_RT_FULL */
11240  static int find_siblings_cpu(int cpu)
11242         int i;
11243 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
11244         if (!x86_match_cpu(pkg_temp_thermal_ids))
11245                 return -ENODEV;
11247 +       if (!thermal_notify_work_init())
11248 +               return -ENODEV;
11250         spin_lock_init(&pkg_work_lock);
11251         platform_thermal_package_notify =
11252                         pkg_temp_thermal_platform_thermal_notify;
11253 @@ -608,7 +651,7 @@ err_ret:
11254         kfree(pkg_work_scheduled);
11255         platform_thermal_package_notify = NULL;
11256         platform_thermal_package_rate_control = NULL;
11258 +       thermal_notify_work_cleanup();
11259         return -ENODEV;
11262 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
11263         mutex_unlock(&phy_dev_list_mutex);
11264         platform_thermal_package_notify = NULL;
11265         platform_thermal_package_rate_control = NULL;
11266 +       thermal_notify_work_cleanup();
11267         for_each_online_cpu(i)
11268                 cancel_delayed_work_sync(
11269                         &per_cpu(pkg_temp_thermal_threshold_work, i));
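The x86_pkg_temp_thermal change above defers the body of the thermal notifier to the RT patch's simple-work (swork) infrastructure, so the notifier itself does almost nothing in interrupt context on PREEMPT_RT_FULL. A hedged sketch of the same API usage with made-up names; the calls mirror what the hunk above uses:

#include <linux/swork.h>

static struct swork_event demo_event;

static void demo_event_fn(struct swork_event *event)
{
        /* the deferred work runs here, in the swork kthread */
}

static int demo_setup(void)
{
        int err = swork_get();          /* bring up the swork infrastructure */

        if (err)
                return err;
        INIT_SWORK(&demo_event, demo_event_fn);
        return 0;
}

static void demo_notify(void)           /* e.g. called from a notifier */
{
        swork_queue(&demo_event);
}

static void demo_teardown(void)
{
        swork_put();
}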
11270 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
11271 index 39126460c1f5..af7701ca4d48 100644
11272 --- a/drivers/tty/serial/8250/8250_core.c
11273 +++ b/drivers/tty/serial/8250/8250_core.c
11274 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
11276  static unsigned int skip_txen_test; /* force skip of txen test at init time */
11278 -#define PASS_LIMIT     512
10280 + * On -rt we can have more delays, and legitimately
10281 + * so - don't drop work spuriously and spam the
11282 + * syslog:
11283 + */
11284 +#ifdef CONFIG_PREEMPT_RT_FULL
11285 +# define PASS_LIMIT    1000000
11286 +#else
11287 +# define PASS_LIMIT    512
11288 +#endif
11290  #include <asm/serial.h>
11291  /*
11292 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
11293 index 56ccbcefdd85..a0b9e854672c 100644
11294 --- a/drivers/tty/serial/8250/8250_port.c
11295 +++ b/drivers/tty/serial/8250/8250_port.c
11296 @@ -35,6 +35,7 @@
11297  #include <linux/nmi.h>
11298  #include <linux/mutex.h>
11299  #include <linux/slab.h>
11300 +#include <linux/kdb.h>
11301  #include <linux/uaccess.h>
11302  #include <linux/pm_runtime.h>
11304 @@ -2843,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
11306         serial8250_rpm_get(up);
11308 -       if (port->sysrq)
11309 +       if (port->sysrq || oops_in_progress)
11310                 locked = 0;
11311 -       else if (oops_in_progress)
11312 +       else if (in_kdb_printk())
11313                 locked = spin_trylock_irqsave(&port->lock, flags);
11314         else
11315                 spin_lock_irqsave(&port->lock, flags);
11316 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
11317 index 899a77187bde..3ff6363b3751 100644
11318 --- a/drivers/tty/serial/amba-pl011.c
11319 +++ b/drivers/tty/serial/amba-pl011.c
11320 @@ -2067,13 +2067,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11322         clk_enable(uap->clk);
11324 -       local_irq_save(flags);
11325 +       /*
11326 +        * local_irq_save(flags);
11327 +        *
11328 +        * This local_irq_save() is nonsense. If we come in via sysrq
11329 +        * handling then interrupts are already disabled. Aside from
11330 +        * that, the port.sysrq check is racy on SMP regardless.
11331 +       */
11332         if (uap->port.sysrq)
11333                 locked = 0;
11334         else if (oops_in_progress)
11335 -               locked = spin_trylock(&uap->port.lock);
11336 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
11337         else
11338 -               spin_lock(&uap->port.lock);
11339 +               spin_lock_irqsave(&uap->port.lock, flags);
11341         /*
11342          *      First save the CR then disable the interrupts
11343 @@ -2098,8 +2104,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
11344                 writew(old_cr, uap->port.membase + UART011_CR);
11346         if (locked)
11347 -               spin_unlock(&uap->port.lock);
11348 -       local_irq_restore(flags);
11349 +               spin_unlock_irqrestore(&uap->port.lock, flags);
11351         clk_disable(uap->clk);
11353 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
11354 index de1c143b475f..3fef536a1844 100644
11355 --- a/drivers/tty/serial/omap-serial.c
11356 +++ b/drivers/tty/serial/omap-serial.c
11357 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
11359         pm_runtime_get_sync(up->dev);
11361 -       local_irq_save(flags);
11362 -       if (up->port.sysrq)
11363 -               locked = 0;
11364 -       else if (oops_in_progress)
11365 -               locked = spin_trylock(&up->port.lock);
11366 +       if (up->port.sysrq || oops_in_progress)
11367 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
11368         else
11369 -               spin_lock(&up->port.lock);
11370 +               spin_lock_irqsave(&up->port.lock, flags);
11372         /*
11373          * First save the IER then disable the interrupts
11374 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
11375         pm_runtime_mark_last_busy(up->dev);
11376         pm_runtime_put_autosuspend(up->dev);
11377         if (locked)
11378 -               spin_unlock(&up->port.lock);
11379 -       local_irq_restore(flags);
11380 +               spin_unlock_irqrestore(&up->port.lock, flags);
11383  static int __init
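The serial console hunks above (8250, amba-pl011, omap-serial) converge on one locking pattern for the console write path: take the port lock with interrupts disabled in the normal case, only try it while an oops is in progress so a CPU that died holding it cannot wedge the crash output, and skip it entirely for sysrq. A hedged sketch of that pattern around a hypothetical UART:

#include <linux/kernel.h>
#include <linux/spinlock.h>

struct demo_uart {
        spinlock_t lock;
        int sysrq;
};

static void demo_putchars(struct demo_uart *uart, const char *s, unsigned int n)
{
        /* hypothetical polled output */
}

static void demo_console_write(struct demo_uart *uart, const char *s,
                               unsigned int n)
{
        unsigned long flags;
        int locked = 1;

        if (uart->sysrq)
                locked = 0;
        else if (oops_in_progress)
                locked = spin_trylock_irqsave(&uart->lock, flags);
        else
                spin_lock_irqsave(&uart->lock, flags);

        demo_putchars(uart, s, n);

        if (locked)
                spin_unlock_irqrestore(&uart->lock, flags);
}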
11384 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
11385 index c3f4f2ab7b33..ae1ffb5f7610 100644
11386 --- a/drivers/usb/core/hcd.c
11387 +++ b/drivers/usb/core/hcd.c
11388 @@ -1738,9 +1738,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
11389          * and no one may trigger the above deadlock situation when
11390          * running complete() in tasklet.
11391          */
11392 -       local_irq_save(flags);
11393 +       local_irq_save_nort(flags);
11394         urb->complete(urb);
11395 -       local_irq_restore(flags);
11396 +       local_irq_restore_nort(flags);
11398         usb_anchor_resume_wakeups(anchor);
11399         atomic_dec(&urb->use_count);
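The *_nort conversions above (3c59x, libsas, qla2xxx and the USB HCD giveback path) replace a bare local_irq_save()/local_irq_restore() pair around a callback with the RT variants, which behave like the plain calls on mainline but are essentially no-ops under PREEMPT_RT_FULL so the section stays preemptible. A hedged, hypothetical sketch of the shape of the change; the _nort helpers come from elsewhere in this patch and demo_complete() is made up:

#include <linux/interrupt.h>

static void demo_complete(void *ctx)
{
        /* hypothetical completion callback */
}

static void demo_giveback(void *ctx)
{
        unsigned long flags;

        /* was: local_irq_save(flags); ... local_irq_restore(flags); */
        local_irq_save_nort(flags);
        demo_complete(ctx);
        local_irq_restore_nort(flags);
}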
11400 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
11401 index 9ad5145d3103..bf883695cdfb 100644
11402 --- a/drivers/usb/gadget/function/f_fs.c
11403 +++ b/drivers/usb/gadget/function/f_fs.c
11404 @@ -1404,7 +1404,7 @@ static void ffs_data_put(struct ffs_data *ffs)
11405                 pr_info("%s(): freeing\n", __func__);
11406                 ffs_data_clear(ffs);
11407                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
11408 -                      waitqueue_active(&ffs->ep0req_completion.wait));
11409 +                      swait_active(&ffs->ep0req_completion.wait));
11410                 kfree(ffs->dev_name);
11411                 kfree(ffs);
11412         }
11413 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
11414 index de014436fb22..e52700c7555e 100644
11415 --- a/drivers/usb/gadget/legacy/inode.c
11416 +++ b/drivers/usb/gadget/legacy/inode.c
11417 @@ -345,7 +345,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11418         spin_unlock_irq (&epdata->dev->lock);
11420         if (likely (value == 0)) {
11421 -               value = wait_event_interruptible (done.wait, done.done);
11422 +               value = swait_event_interruptible (done.wait, done.done);
11423                 if (value != 0) {
11424                         spin_lock_irq (&epdata->dev->lock);
11425                         if (likely (epdata->ep != NULL)) {
11426 @@ -354,7 +354,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
11427                                 usb_ep_dequeue (epdata->ep, epdata->req);
11428                                 spin_unlock_irq (&epdata->dev->lock);
11430 -                               wait_event (done.wait, done.done);
11431 +                               swait_event (done.wait, done.done);
11432                                 if (epdata->status == -ECONNRESET)
11433                                         epdata->status = -EINTR;
11434                         } else {
11435 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c
11436 index f92f5aff0dd5..f9bba26e3655 100644
11437 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c
11438 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c
11439 @@ -17,7 +17,9 @@
11440  #include <linux/device.h>
11441  #include <linux/dma-mapping.h>
11442  #include <linux/list.h>
11443 +#include <linux/mfd/syscon.h>
11444  #include <linux/platform_device.h>
11445 +#include <linux/regmap.h>
11446  #include <linux/usb/ch9.h>
11447  #include <linux/usb/gadget.h>
11448  #include <linux/usb/atmel_usba_udc.h>
11449 @@ -1888,20 +1890,15 @@ static int atmel_usba_stop(struct usb_gadget *gadget)
11450  #ifdef CONFIG_OF
11451  static void at91sam9rl_toggle_bias(struct usba_udc *udc, int is_on)
11453 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11455 -       if (is_on)
11456 -               at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11457 -       else
11458 -               at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11459 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11460 +                          is_on ? AT91_PMC_BIASEN : 0);
11463  static void at91sam9g45_pulse_bias(struct usba_udc *udc)
11465 -       unsigned int uckr = at91_pmc_read(AT91_CKGR_UCKR);
11467 -       at91_pmc_write(AT91_CKGR_UCKR, uckr & ~(AT91_PMC_BIASEN));
11468 -       at91_pmc_write(AT91_CKGR_UCKR, uckr | AT91_PMC_BIASEN);
11469 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN, 0);
11470 +       regmap_update_bits(udc->pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
11471 +                          AT91_PMC_BIASEN);
11474  static const struct usba_udc_errata at91sam9rl_errata = {
11475 @@ -1938,6 +1935,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev,
11476                 return ERR_PTR(-EINVAL);
11478         udc->errata = match->data;
11479 +       udc->pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
11480 +       if (udc->errata && IS_ERR(udc->pmc))
11481 +               return ERR_CAST(udc->pmc);
11483         udc->num_ep = 0;
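The atmel_usba_udc hunks above drop the direct at91_pmc_read()/at91_pmc_write() accessors in favour of a regmap handle looked up through syscon, matching the removal of at91_pmc_base from include/linux/clk/at91_pmc.h later in this patch. A minimal sketch of the same lookup-and-update pattern; the example_* function names are illustrative only:

	#include <linux/clk/at91_pmc.h>	/* AT91_CKGR_UCKR, AT91_PMC_BIASEN */
	#include <linux/err.h>
	#include <linux/mfd/syscon.h>
	#include <linux/regmap.h>

	static struct regmap *pmc;

	/* Resolve the PMC syscon node once, e.g. at probe time. */
	static int example_pmc_init(void)
	{
		pmc = syscon_regmap_lookup_by_compatible("atmel,at91sam9g45-pmc");
		return PTR_ERR_OR_ZERO(pmc);
	}

	/* Read-modify-write of the BIASEN bit without touching other UCKR bits. */
	static void example_toggle_bias(int is_on)
	{
		regmap_update_bits(pmc, AT91_CKGR_UCKR, AT91_PMC_BIASEN,
				   is_on ? AT91_PMC_BIASEN : 0);
	}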
11485 diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.h b/drivers/usb/gadget/udc/atmel_usba_udc.h
11486 index ea448a344767..3e1c9d589dfa 100644
11487 --- a/drivers/usb/gadget/udc/atmel_usba_udc.h
11488 +++ b/drivers/usb/gadget/udc/atmel_usba_udc.h
11489 @@ -354,6 +354,8 @@ struct usba_udc {
11490         struct dentry *debugfs_root;
11491         struct dentry *debugfs_regs;
11492  #endif
11494 +       struct regmap *pmc;
11495  };
11497  static inline struct usba_ep *to_usba_ep(struct usb_ep *ep)
11498 diff --git a/fs/aio.c b/fs/aio.c
11499 index fe4f49212b99..c3194afdc3df 100644
11500 --- a/fs/aio.c
11501 +++ b/fs/aio.c
11502 @@ -40,6 +40,7 @@
11503  #include <linux/ramfs.h>
11504  #include <linux/percpu-refcount.h>
11505  #include <linux/mount.h>
11506 +#include <linux/swork.h>
11508  #include <asm/kmap_types.h>
11509  #include <asm/uaccess.h>
11510 @@ -115,7 +116,7 @@ struct kioctx {
11511         struct page             **ring_pages;
11512         long                    nr_pages;
11514 -       struct work_struct      free_work;
11515 +       struct swork_event      free_work;
11517         /*
11518          * signals when all in-flight requests are done
11519 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
11520                 .mount          = aio_mount,
11521                 .kill_sb        = kill_anon_super,
11522         };
11523 +       BUG_ON(swork_get());
11524         aio_mnt = kern_mount(&aio_fs);
11525         if (IS_ERR(aio_mnt))
11526                 panic("Failed to create aio fs mount.");
11527 @@ -573,9 +575,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
11528         return cancel(&kiocb->common);
11531 -static void free_ioctx(struct work_struct *work)
11532 +static void free_ioctx(struct swork_event *sev)
11534 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
11535 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11537         pr_debug("freeing %p\n", ctx);
11539 @@ -594,8 +596,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11540         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
11541                 complete(&ctx->rq_wait->comp);
11543 -       INIT_WORK(&ctx->free_work, free_ioctx);
11544 -       schedule_work(&ctx->free_work);
11545 +       INIT_SWORK(&ctx->free_work, free_ioctx);
11546 +       swork_queue(&ctx->free_work);
11549  /*
11550 @@ -603,9 +605,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
11551   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
11552   * now it's safe to cancel any that need to be.
11553   */
11554 -static void free_ioctx_users(struct percpu_ref *ref)
11555 +static void free_ioctx_users_work(struct swork_event *sev)
11557 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11558 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
11559         struct aio_kiocb *req;
11561         spin_lock_irq(&ctx->ctx_lock);
11562 @@ -624,6 +626,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
11563         percpu_ref_put(&ctx->reqs);
11566 +static void free_ioctx_users(struct percpu_ref *ref)
11568 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
11570 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
11571 +       swork_queue(&ctx->free_work);
11574  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
11576         unsigned i, new_nr;
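fs/aio.c moves the context teardown from the system workqueue to the RT series' simple-work (swork) infrastructure, whose callbacks run from a dedicated kthread and may therefore take sleeping locks on PREEMPT_RT. A minimal sketch of the conversion pattern, using a hypothetical my_obj structure:

	#include <linux/slab.h>
	#include <linux/swork.h>

	/* Hypothetical object whose teardown must not run from a workqueue on RT. */
	struct my_obj {
		struct swork_event free_work;	/* replaces struct work_struct */
	};

	static void my_obj_free(struct swork_event *sev)
	{
		struct my_obj *obj = container_of(sev, struct my_obj, free_work);

		kfree(obj);
	}

	static void my_obj_release(struct my_obj *obj)
	{
		INIT_SWORK(&obj->free_work, my_obj_free);
		swork_queue(&obj->free_work);
	}

	/* The swork kthread must be brought up once during init, as aio_setup()
	 * does above:  BUG_ON(swork_get());
	 */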
11577 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
11578 index 502d3892d8a4..05af8d3e6e88 100644
11579 --- a/fs/autofs4/autofs_i.h
11580 +++ b/fs/autofs4/autofs_i.h
11581 @@ -34,6 +34,7 @@
11582  #include <linux/sched.h>
11583  #include <linux/mount.h>
11584  #include <linux/namei.h>
11585 +#include <linux/delay.h>
11586  #include <asm/current.h>
11587  #include <asm/uaccess.h>
11589 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
11590 index 7a5a598a2d94..d08bcdc30566 100644
11591 --- a/fs/autofs4/expire.c
11592 +++ b/fs/autofs4/expire.c
11593 @@ -150,7 +150,7 @@ again:
11594                         parent = p->d_parent;
11595                         if (!spin_trylock(&parent->d_lock)) {
11596                                 spin_unlock(&p->d_lock);
11597 -                               cpu_relax();
11598 +                               cpu_chill();
11599                                 goto relock;
11600                         }
11601                         spin_unlock(&p->d_lock);
11602 diff --git a/fs/buffer.c b/fs/buffer.c
11603 index 4f4cd959da7c..72b27e17b907 100644
11604 --- a/fs/buffer.c
11605 +++ b/fs/buffer.c
11606 @@ -305,8 +305,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11607          * decide that the page is now completely done.
11608          */
11609         first = page_buffers(page);
11610 -       local_irq_save(flags);
11611 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11612 +       flags = bh_uptodate_lock_irqsave(first);
11613         clear_buffer_async_read(bh);
11614         unlock_buffer(bh);
11615         tmp = bh;
11616 @@ -319,8 +318,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11617                 }
11618                 tmp = tmp->b_this_page;
11619         } while (tmp != bh);
11620 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11621 -       local_irq_restore(flags);
11622 +       bh_uptodate_unlock_irqrestore(first, flags);
11624         /*
11625          * If none of the buffers had errors and they are all
11626 @@ -332,9 +330,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
11627         return;
11629  still_busy:
11630 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11631 -       local_irq_restore(flags);
11632 -       return;
11633 +       bh_uptodate_unlock_irqrestore(first, flags);
11636  /*
11637 @@ -362,8 +358,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11638         }
11640         first = page_buffers(page);
11641 -       local_irq_save(flags);
11642 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11643 +       flags = bh_uptodate_lock_irqsave(first);
11645         clear_buffer_async_write(bh);
11646         unlock_buffer(bh);
11647 @@ -375,15 +370,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
11648                 }
11649                 tmp = tmp->b_this_page;
11650         }
11651 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11652 -       local_irq_restore(flags);
11653 +       bh_uptodate_unlock_irqrestore(first, flags);
11654         end_page_writeback(page);
11655         return;
11657  still_busy:
11658 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11659 -       local_irq_restore(flags);
11660 -       return;
11661 +       bh_uptodate_unlock_irqrestore(first, flags);
11663  EXPORT_SYMBOL(end_buffer_async_write);
11665 @@ -3325,6 +3317,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
11666         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
11667         if (ret) {
11668                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
11669 +               buffer_head_init_locks(ret);
11670                 preempt_disable();
11671                 __this_cpu_inc(bh_accounting.nr);
11672                 recalc_bh_state();
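Both end_buffer_async_read() and end_buffer_async_write() above stop open-coding local_irq_save() plus bit_spin_lock(BH_Uptodate_Lock) and instead call the bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() helpers that this patch adds to include/linux/buffer_head.h further down; on RT those helpers take a real per-buffer spinlock instead of disabling interrupts. The calling convention, sketched:

	struct buffer_head *first = page_buffers(page);
	unsigned long flags;

	flags = bh_uptodate_lock_irqsave(first);
	/* ... walk the first->b_this_page ring and clear the async bits ... */
	bh_uptodate_unlock_irqrestore(first, flags);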
11673 diff --git a/fs/dcache.c b/fs/dcache.c
11674 index 849c1c1e787b..995995130191 100644
11675 --- a/fs/dcache.c
11676 +++ b/fs/dcache.c
11677 @@ -19,6 +19,7 @@
11678  #include <linux/mm.h>
11679  #include <linux/fs.h>
11680  #include <linux/fsnotify.h>
11681 +#include <linux/delay.h>
11682  #include <linux/slab.h>
11683  #include <linux/init.h>
11684  #include <linux/hash.h>
11685 @@ -747,6 +748,8 @@ static inline bool fast_dput(struct dentry *dentry)
11686   */
11687  void dput(struct dentry *dentry)
11689 +       struct dentry *parent;
11691         if (unlikely(!dentry))
11692                 return;
11694 @@ -783,9 +786,18 @@ repeat:
11695         return;
11697  kill_it:
11698 -       dentry = dentry_kill(dentry);
11699 -       if (dentry) {
11700 -               cond_resched();
11701 +       parent = dentry_kill(dentry);
11702 +       if (parent) {
11703 +               int r;
11705 +               if (parent == dentry) {
11706 +                       /* the task with the highest priority won't schedule */
11707 +                       r = cond_resched();
11708 +                       if (!r)
11709 +                               cpu_chill();
11710 +               } else {
11711 +                       dentry = parent;
11712 +               }
11713                 goto repeat;
11714         }
11716 @@ -2397,7 +2409,7 @@ again:
11717         if (dentry->d_lockref.count == 1) {
11718                 if (!spin_trylock(&inode->i_lock)) {
11719                         spin_unlock(&dentry->d_lock);
11720 -                       cpu_relax();
11721 +                       cpu_chill();
11722                         goto again;
11723                 }
11724                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
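The expire.c and dcache.c hunks replace cpu_relax() with cpu_chill() in trylock retry loops. On PREEMPT_RT a spinning task can keep a lower-priority lock holder from ever running, so cpu_chill() sleeps briefly instead of busy-waiting; on non-RT kernels it maps back to cpu_relax() (see the include/linux/delay.h hunk below). The retry shape, as a minimal sketch with an illustrative helper name:

	#include <linux/delay.h>	/* cpu_chill() */
	#include <linux/fs.h>

	/* Illustrative helper: take d_lock, then try i_lock without deadlocking. */
	static void example_lock_pair(struct dentry *dentry, struct inode *inode)
	{
	again:
		spin_lock(&dentry->d_lock);
		if (!spin_trylock(&inode->i_lock)) {
			spin_unlock(&dentry->d_lock);
			cpu_chill();	/* brief sleep on RT instead of busy-wait */
			goto again;
		}
		/* both locks held here */
	}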
11725 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
11726 index 1e009cad8d5c..d0c12504d3b4 100644
11727 --- a/fs/eventpoll.c
11728 +++ b/fs/eventpoll.c
11729 @@ -505,12 +505,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
11730   */
11731  static void ep_poll_safewake(wait_queue_head_t *wq)
11733 -       int this_cpu = get_cpu();
11734 +       int this_cpu = get_cpu_light();
11736         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
11737                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
11739 -       put_cpu();
11740 +       put_cpu_light();
11743  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
11744 diff --git a/fs/exec.c b/fs/exec.c
11745 index 3a6de10d3891..74962ac0a738 100644
11746 --- a/fs/exec.c
11747 +++ b/fs/exec.c
11748 @@ -866,12 +866,14 @@ static int exec_mmap(struct mm_struct *mm)
11749                 }
11750         }
11751         task_lock(tsk);
11752 +       preempt_disable_rt();
11753         active_mm = tsk->active_mm;
11754         tsk->mm = mm;
11755         tsk->active_mm = mm;
11756         activate_mm(active_mm, mm);
11757         tsk->mm->vmacache_seqnum = 0;
11758         vmacache_flush(tsk);
11759 +       preempt_enable_rt();
11760         task_unlock(tsk);
11761         if (old_mm) {
11762                 up_read(&old_mm->mmap_sem);
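exec_mmap() now brackets the mm switch with preempt_disable_rt()/preempt_enable_rt(), so the CPU cannot be preempted while tsk->mm, tsk->active_mm and the VMA cache are being updated. The assumed macro mapping and the shape of the critical section:

	/* Assumed mapping, defined elsewhere in the RT series (linux/preempt.h):
	 *   !PREEMPT_RT: preempt_disable_rt()/preempt_enable_rt() are no-ops
	 *    PREEMPT_RT: they become preempt_disable()/preempt_enable()
	 */
	preempt_disable_rt();
	/* ... update tsk->mm / tsk->active_mm, activate_mm(), vmacache_flush() ... */
	preempt_enable_rt();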
11763 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
11764 index 2871576fbca4..d1137790ea58 100644
11765 --- a/fs/f2fs/f2fs.h
11766 +++ b/fs/f2fs/f2fs.h
11767 @@ -24,7 +24,6 @@
11769  #ifdef CONFIG_F2FS_CHECK_FS
11770  #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
11771 -#define f2fs_down_write(x, y)  down_write_nest_lock(x, y)
11772  #else
11773  #define f2fs_bug_on(sbi, condition)                                    \
11774         do {                                                            \
11775 @@ -33,7 +32,6 @@
11776                         set_sbi_flag(sbi, SBI_NEED_FSCK);               \
11777                 }                                                       \
11778         } while (0)
11779 -#define f2fs_down_write(x, y)  down_write(x)
11780  #endif
11782  /*
11783 @@ -959,7 +957,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
11785  static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
11787 -       f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
11788 +       down_write(&sbi->cp_rwsem);
11791  static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
11792 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
11793 index 684996c8a3a4..6e18a06aaabe 100644
11794 --- a/fs/jbd2/checkpoint.c
11795 +++ b/fs/jbd2/checkpoint.c
11796 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
11797         nblocks = jbd2_space_needed(journal);
11798         while (jbd2_log_space_left(journal) < nblocks) {
11799                 write_unlock(&journal->j_state_lock);
11800 +               if (current->plug)
11801 +                       io_schedule();
11802                 mutex_lock(&journal->j_checkpoint_mutex);
11804                 /*
11805 diff --git a/fs/namespace.c b/fs/namespace.c
11806 index f26d18d69712..6e0e024715e8 100644
11807 --- a/fs/namespace.c
11808 +++ b/fs/namespace.c
11809 @@ -14,6 +14,7 @@
11810  #include <linux/mnt_namespace.h>
11811  #include <linux/user_namespace.h>
11812  #include <linux/namei.h>
11813 +#include <linux/delay.h>
11814  #include <linux/security.h>
11815  #include <linux/idr.h>
11816  #include <linux/init.h>                /* init_rootfs */
11817 @@ -356,8 +357,11 @@ int __mnt_want_write(struct vfsmount *m)
11818          * incremented count after it has set MNT_WRITE_HOLD.
11819          */
11820         smp_mb();
11821 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
11822 -               cpu_relax();
11823 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
11824 +               preempt_enable();
11825 +               cpu_chill();
11826 +               preempt_disable();
11827 +       }
11828         /*
11829          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
11830          * be set to match its requirements. So we must not load that until
11831 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
11832 index 7521e11db728..f0de4b6b8bf3 100644
11833 --- a/fs/ntfs/aops.c
11834 +++ b/fs/ntfs/aops.c
11835 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11836                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
11837         }
11838         first = page_buffers(page);
11839 -       local_irq_save(flags);
11840 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
11841 +       flags = bh_uptodate_lock_irqsave(first);
11842         clear_buffer_async_read(bh);
11843         unlock_buffer(bh);
11844         tmp = bh;
11845 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11846                 }
11847                 tmp = tmp->b_this_page;
11848         } while (tmp != bh);
11849 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11850 -       local_irq_restore(flags);
11851 +       bh_uptodate_unlock_irqrestore(first, flags);
11852         /*
11853          * If none of the buffers had errors then we can set the page uptodate,
11854          * but we first have to perform the post read mst fixups, if the
11855 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11856                 recs = PAGE_CACHE_SIZE / rec_size;
11857                 /* Should have been verified before we got here... */
11858                 BUG_ON(!recs);
11859 -               local_irq_save(flags);
11860 +               local_irq_save_nort(flags);
11861                 kaddr = kmap_atomic(page);
11862                 for (i = 0; i < recs; i++)
11863                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
11864                                         i * rec_size), rec_size);
11865                 kunmap_atomic(kaddr);
11866 -               local_irq_restore(flags);
11867 +               local_irq_restore_nort(flags);
11868                 flush_dcache_page(page);
11869                 if (likely(page_uptodate && !PageError(page)))
11870                         SetPageUptodate(page);
11871 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
11872         unlock_page(page);
11873         return;
11874  still_busy:
11875 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
11876 -       local_irq_restore(flags);
11877 -       return;
11878 +       bh_uptodate_unlock_irqrestore(first, flags);
11881  /**
11882 diff --git a/fs/timerfd.c b/fs/timerfd.c
11883 index 1327a02ec778..4260febcb029 100644
11884 --- a/fs/timerfd.c
11885 +++ b/fs/timerfd.c
11886 @@ -461,7 +461,10 @@ static int do_timerfd_settime(int ufd, int flags,
11887                                 break;
11888                 }
11889                 spin_unlock_irq(&ctx->wqh.lock);
11890 -               cpu_relax();
11891 +               if (isalarm(ctx))
11892 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
11893 +               else
11894 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
11895         }
11897         /*
11898 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
11899 index 323e5daece54..cc5fbd534fd4 100644
11900 --- a/include/acpi/platform/aclinux.h
11901 +++ b/include/acpi/platform/aclinux.h
11902 @@ -127,6 +127,7 @@
11904  #define acpi_cache_t                        struct kmem_cache
11905  #define acpi_spinlock                       spinlock_t *
11906 +#define acpi_raw_spinlock              raw_spinlock_t *
11907  #define acpi_cpu_flags                      unsigned long
11909  /* Use native linux version of acpi_os_allocate_zeroed */
11910 @@ -145,6 +146,20 @@
11911  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
11912  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
11914 +#define acpi_os_create_raw_lock(__handle)                      \
11915 +({                                                             \
11916 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
11917 +                                                               \
11918 +        if (lock) {                                            \
11919 +               *(__handle) = lock;                             \
11920 +               raw_spin_lock_init(*(__handle));                \
11921 +        }                                                      \
11922 +        lock ? AE_OK : AE_NO_MEMORY;                           \
11923 + })
11925 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
11928  /*
11929   * OSL interfaces used by debugger/disassembler
11930   */
11931 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
11932 index 630dd2372238..850e4d993a88 100644
11933 --- a/include/asm-generic/bug.h
11934 +++ b/include/asm-generic/bug.h
11935 @@ -206,6 +206,20 @@ extern void warn_slowpath_null(const char *file, const int line);
11936  # define WARN_ON_SMP(x)                        ({0;})
11937  #endif
11939 +#ifdef CONFIG_PREEMPT_RT_BASE
11940 +# define BUG_ON_RT(c)                  BUG_ON(c)
11941 +# define BUG_ON_NONRT(c)               do { } while (0)
11942 +# define WARN_ON_RT(condition)         WARN_ON(condition)
11943 +# define WARN_ON_NONRT(condition)      do { } while (0)
11944 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
11945 +#else
11946 +# define BUG_ON_RT(c)                  do { } while (0)
11947 +# define BUG_ON_NONRT(c)               BUG_ON(c)
11948 +# define WARN_ON_RT(condition)         do { } while (0)
11949 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
11950 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
11951 +#endif
11953  #endif /* __ASSEMBLY__ */
11955  #endif
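The BUG_ON_RT()/BUG_ON_NONRT() and WARN_ON_*RT() variants above make an assertion conditional on the preemption model: the _NONRT forms compile away on PREEMPT_RT_BASE, the _RT forms compile away everywhere else. Typical use, with illustrative conditions:

	/* Checked only on non-RT kernels, where this path really runs with
	 * hard interrupts disabled; on RT the check disappears. */
	WARN_ON_NONRT(!irqs_disabled());

	/* Checked only on RT kernels. */
	WARN_ON_RT(in_atomic());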
11956 diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
11957 index 5d8ffa3e6f8c..c1cde3577551 100644
11958 --- a/include/asm-generic/preempt.h
11959 +++ b/include/asm-generic/preempt.h
11960 @@ -7,10 +7,10 @@
11962  static __always_inline int preempt_count(void)
11964 -       return current_thread_info()->preempt_count;
11965 +       return READ_ONCE(current_thread_info()->preempt_count);
11968 -static __always_inline int *preempt_count_ptr(void)
11969 +static __always_inline volatile int *preempt_count_ptr(void)
11971         return &current_thread_info()->preempt_count;
11973 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
11974 index daf17d70aeca..463df8954255 100644
11975 --- a/include/linux/blk-mq.h
11976 +++ b/include/linux/blk-mq.h
11977 @@ -212,6 +212,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
11979  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
11980  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
11981 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
11983  int blk_mq_request_started(struct request *rq);
11984  void blk_mq_start_request(struct request *rq);
11985 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
11986 index fe14382f9664..a82143ad6702 100644
11987 --- a/include/linux/blkdev.h
11988 +++ b/include/linux/blkdev.h
11989 @@ -89,6 +89,7 @@ struct request {
11990         struct list_head queuelist;
11991         union {
11992                 struct call_single_data csd;
11993 +               struct work_struct work;
11994                 unsigned long fifo_time;
11995         };
11997 @@ -455,7 +456,7 @@ struct request_queue {
11998         struct throtl_data *td;
11999  #endif
12000         struct rcu_head         rcu_head;
12001 -       wait_queue_head_t       mq_freeze_wq;
12002 +       struct swait_queue_head mq_freeze_wq;
12003         struct percpu_ref       q_usage_counter;
12004         struct list_head        all_q_node;
12006 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
12007 index 8fdcb783197d..d07dbeec7bc1 100644
12008 --- a/include/linux/bottom_half.h
12009 +++ b/include/linux/bottom_half.h
12010 @@ -3,6 +3,39 @@
12012  #include <linux/preempt.h>
12014 +#ifdef CONFIG_PREEMPT_RT_FULL
12016 +extern void __local_bh_disable(void);
12017 +extern void _local_bh_enable(void);
12018 +extern void __local_bh_enable(void);
12020 +static inline void local_bh_disable(void)
12022 +       __local_bh_disable();
12025 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
12027 +       __local_bh_disable();
12030 +static inline void local_bh_enable(void)
12032 +       __local_bh_enable();
12035 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
12037 +       __local_bh_enable();
12040 +static inline void local_bh_enable_ip(unsigned long ip)
12042 +       __local_bh_enable();
12045 +#else
12047  #ifdef CONFIG_TRACE_IRQFLAGS
12048  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
12049  #else
12050 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
12052         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
12054 +#endif
12056  #endif /* _LINUX_BH_H */
12057 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
12058 index 89d9aa9e79bf..4a201008b02d 100644
12059 --- a/include/linux/buffer_head.h
12060 +++ b/include/linux/buffer_head.h
12061 @@ -75,8 +75,50 @@ struct buffer_head {
12062         struct address_space *b_assoc_map;      /* mapping this buffer is
12063                                                    associated with */
12064         atomic_t b_count;               /* users using this buffer_head */
12065 +#ifdef CONFIG_PREEMPT_RT_BASE
12066 +       spinlock_t b_uptodate_lock;
12067 +#if IS_ENABLED(CONFIG_JBD2)
12068 +       spinlock_t b_state_lock;
12069 +       spinlock_t b_journal_head_lock;
12070 +#endif
12071 +#endif
12072  };
12074 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
12076 +       unsigned long flags;
12078 +#ifndef CONFIG_PREEMPT_RT_BASE
12079 +       local_irq_save(flags);
12080 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
12081 +#else
12082 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
12083 +#endif
12084 +       return flags;
12087 +static inline void
12088 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
12090 +#ifndef CONFIG_PREEMPT_RT_BASE
12091 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
12092 +       local_irq_restore(flags);
12093 +#else
12094 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
12095 +#endif
12098 +static inline void buffer_head_init_locks(struct buffer_head *bh)
12100 +#ifdef CONFIG_PREEMPT_RT_BASE
12101 +       spin_lock_init(&bh->b_uptodate_lock);
12102 +#if IS_ENABLED(CONFIG_JBD2)
12103 +       spin_lock_init(&bh->b_state_lock);
12104 +       spin_lock_init(&bh->b_journal_head_lock);
12105 +#endif
12106 +#endif
12109  /*
12110   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
12111   * and buffer_foo() functions.
12112 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
12113 index 8da263299754..0cc474291e08 100644
12114 --- a/include/linux/cgroup-defs.h
12115 +++ b/include/linux/cgroup-defs.h
12116 @@ -16,6 +16,7 @@
12117  #include <linux/percpu-refcount.h>
12118  #include <linux/percpu-rwsem.h>
12119  #include <linux/workqueue.h>
12120 +#include <linux/swork.h>
12122  #ifdef CONFIG_CGROUPS
12124 @@ -142,6 +143,7 @@ struct cgroup_subsys_state {
12125         /* percpu_ref killing and RCU release */
12126         struct rcu_head rcu_head;
12127         struct work_struct destroy_work;
12128 +       struct swork_event destroy_swork;
12129  };
12131  /*
12132 diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h
12133 index 1e6932222e11..17f413bbbedf 100644
12134 --- a/include/linux/clk/at91_pmc.h
12135 +++ b/include/linux/clk/at91_pmc.h
12136 @@ -16,18 +16,6 @@
12137  #ifndef AT91_PMC_H
12138  #define AT91_PMC_H
12140 -#ifndef __ASSEMBLY__
12141 -extern void __iomem *at91_pmc_base;
12143 -#define at91_pmc_read(field) \
12144 -       readl_relaxed(at91_pmc_base + field)
12146 -#define at91_pmc_write(field, value) \
12147 -       writel_relaxed(value, at91_pmc_base + field)
12148 -#else
12149 -.extern at91_pmc_base
12150 -#endif
12152  #define        AT91_PMC_SCER           0x00                    /* System Clock Enable Register */
12153  #define        AT91_PMC_SCDR           0x04                    /* System Clock Disable Register */
12155 diff --git a/include/linux/completion.h b/include/linux/completion.h
12156 index 5d5aaae3af43..3bca1590e29f 100644
12157 --- a/include/linux/completion.h
12158 +++ b/include/linux/completion.h
12159 @@ -7,8 +7,7 @@
12160   * Atomic wait-for-completion handler data structures.
12161   * See kernel/sched/completion.c for details.
12162   */
12164 -#include <linux/wait.h>
12165 +#include <linux/swait.h>
12167  /*
12168   * struct completion - structure used to maintain state for a "completion"
12169 @@ -24,11 +23,11 @@
12170   */
12171  struct completion {
12172         unsigned int done;
12173 -       wait_queue_head_t wait;
12174 +       struct swait_queue_head wait;
12175  };
12177  #define COMPLETION_INITIALIZER(work) \
12178 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
12179 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
12181  #define COMPLETION_INITIALIZER_ONSTACK(work) \
12182         ({ init_completion(&work); work; })
12183 @@ -73,7 +72,7 @@ struct completion {
12184  static inline void init_completion(struct completion *x)
12186         x->done = 0;
12187 -       init_waitqueue_head(&x->wait);
12188 +       init_swait_queue_head(&x->wait);
12191  /**
12192 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
12193 index 3ea9aae2387d..5de5f28a5ef3 100644
12194 --- a/include/linux/cpu.h
12195 +++ b/include/linux/cpu.h
12196 @@ -224,6 +224,8 @@ extern void get_online_cpus(void);
12197  extern void put_online_cpus(void);
12198  extern void cpu_hotplug_disable(void);
12199  extern void cpu_hotplug_enable(void);
12200 +extern void pin_current_cpu(void);
12201 +extern void unpin_current_cpu(void);
12202  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
12203  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
12204  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
12205 @@ -241,6 +243,8 @@ static inline void cpu_hotplug_done(void) {}
12206  #define put_online_cpus()      do { } while (0)
12207  #define cpu_hotplug_disable()  do { } while (0)
12208  #define cpu_hotplug_enable()   do { } while (0)
12209 +static inline void pin_current_cpu(void) { }
12210 +static inline void unpin_current_cpu(void) { }
12211  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
12212  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
12213  /* These aren't inline functions due to a GCC bug. */
12214 diff --git a/include/linux/delay.h b/include/linux/delay.h
12215 index a6ecb34cf547..37caab306336 100644
12216 --- a/include/linux/delay.h
12217 +++ b/include/linux/delay.h
12218 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
12219         msleep(seconds * 1000);
12222 +#ifdef CONFIG_PREEMPT_RT_FULL
12223 +extern void cpu_chill(void);
12224 +#else
12225 +# define cpu_chill()   cpu_relax()
12226 +#endif
12228  #endif /* defined(_LINUX_DELAY_H) */
12229 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
12230 index 60048c50404e..f2cd67624f18 100644
12231 --- a/include/linux/ftrace.h
12232 +++ b/include/linux/ftrace.h
12233 @@ -694,6 +694,18 @@ static inline void __ftrace_enabled_restore(int enabled)
12234  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
12235  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
12237 +static inline unsigned long get_lock_parent_ip(void)
12239 +       unsigned long addr = CALLER_ADDR0;
12241 +       if (!in_lock_functions(addr))
12242 +               return addr;
12243 +       addr = CALLER_ADDR1;
12244 +       if (!in_lock_functions(addr))
12245 +               return addr;
12246 +       return CALLER_ADDR2;
12249  #ifdef CONFIG_IRQSOFF_TRACER
12250    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
12251    extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
12252 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
12253 index bb3f3297062a..a117a33ef72c 100644
12254 --- a/include/linux/highmem.h
12255 +++ b/include/linux/highmem.h
12256 @@ -7,6 +7,7 @@
12257  #include <linux/mm.h>
12258  #include <linux/uaccess.h>
12259  #include <linux/hardirq.h>
12260 +#include <linux/sched.h>
12262  #include <asm/cacheflush.h>
12264 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
12266  static inline void *kmap_atomic(struct page *page)
12268 -       preempt_disable();
12269 +       preempt_disable_nort();
12270         pagefault_disable();
12271         return page_address(page);
12273 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
12274  static inline void __kunmap_atomic(void *addr)
12276         pagefault_enable();
12277 -       preempt_enable();
12278 +       preempt_enable_nort();
12281  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
12282 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
12284  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
12286 +#ifndef CONFIG_PREEMPT_RT_FULL
12287  DECLARE_PER_CPU(int, __kmap_atomic_idx);
12288 +#endif
12290  static inline int kmap_atomic_idx_push(void)
12292 +#ifndef CONFIG_PREEMPT_RT_FULL
12293         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
12295 -#ifdef CONFIG_DEBUG_HIGHMEM
12296 +# ifdef CONFIG_DEBUG_HIGHMEM
12297         WARN_ON_ONCE(in_irq() && !irqs_disabled());
12298         BUG_ON(idx >= KM_TYPE_NR);
12299 -#endif
12300 +# endif
12301         return idx;
12302 +#else
12303 +       current->kmap_idx++;
12304 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
12305 +       return current->kmap_idx - 1;
12306 +#endif
12309  static inline int kmap_atomic_idx(void)
12311 +#ifndef CONFIG_PREEMPT_RT_FULL
12312         return __this_cpu_read(__kmap_atomic_idx) - 1;
12313 +#else
12314 +       return current->kmap_idx - 1;
12315 +#endif
12318  static inline void kmap_atomic_idx_pop(void)
12320 -#ifdef CONFIG_DEBUG_HIGHMEM
12321 +#ifndef CONFIG_PREEMPT_RT_FULL
12322 +# ifdef CONFIG_DEBUG_HIGHMEM
12323         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
12325         BUG_ON(idx < 0);
12326 -#else
12327 +# else
12328         __this_cpu_dec(__kmap_atomic_idx);
12329 +# endif
12330 +#else
12331 +       current->kmap_idx--;
12332 +# ifdef CONFIG_DEBUG_HIGHMEM
12333 +       BUG_ON(current->kmap_idx < 0);
12334 +# endif
12335  #endif
12338 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
12339 index 2ead22dd74a0..8fbcdfa5dc77 100644
12340 --- a/include/linux/hrtimer.h
12341 +++ b/include/linux/hrtimer.h
12342 @@ -87,6 +87,9 @@ enum hrtimer_restart {
12343   * @function:  timer expiry callback function
12344   * @base:      pointer to the timer base (per cpu and per clock)
12345   * @state:     state information (See bit values above)
12346 + * @cb_entry:  list entry to defer timers from hardirq context
12347 + * @irqsafe:   timer can run in hardirq context
12348 + * @praecox:   timer expiry time if expired at the time of programming
12349   * @is_rel:    Set if the timer was armed relative
12350   * @start_pid:  timer statistics field to store the pid of the task which
12351   *             started the timer
12352 @@ -103,6 +106,11 @@ struct hrtimer {
12353         enum hrtimer_restart            (*function)(struct hrtimer *);
12354         struct hrtimer_clock_base       *base;
12355         u8                              state;
12356 +       struct list_head                cb_entry;
12357 +       int                             irqsafe;
12358 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
12359 +       ktime_t                         praecox;
12360 +#endif
12361         u8                              is_rel;
12362  #ifdef CONFIG_TIMER_STATS
12363         int                             start_pid;
12364 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
12365         struct task_struct *task;
12366  };
12368 -#ifdef CONFIG_64BIT
12369  # define HRTIMER_CLOCK_BASE_ALIGN      64
12370 -#else
12371 -# define HRTIMER_CLOCK_BASE_ALIGN      32
12372 -#endif
12374  /**
12375   * struct hrtimer_clock_base - the timer base for a specific clock
12376 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
12377   *                     timer to a base on another cpu.
12378   * @clockid:           clock id for per_cpu support
12379   * @active:            red black tree root node for the active timers
12380 + * @expired:           list head for deferred timers.
12381   * @get_time:          function to retrieve the current time of the clock
12382   * @offset:            offset of this clock to the monotonic base
12383   */
12384 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
12385         int                     index;
12386         clockid_t               clockid;
12387         struct timerqueue_head  active;
12388 +       struct list_head        expired;
12389         ktime_t                 (*get_time)(void);
12390         ktime_t                 offset;
12391  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
12392 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
12393         raw_spinlock_t                  lock;
12394         seqcount_t                      seq;
12395         struct hrtimer                  *running;
12396 +       struct hrtimer                  *running_soft;
12397         unsigned int                    cpu;
12398         unsigned int                    active_bases;
12399         unsigned int                    clock_was_set_seq;
12400 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
12401         unsigned int                    nr_hangs;
12402         unsigned int                    max_hang_time;
12403  #endif
12404 +#ifdef CONFIG_PREEMPT_RT_BASE
12405 +       wait_queue_head_t               wait;
12406 +#endif
12407         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
12408  } ____cacheline_aligned;
12410 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
12411         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
12414 +/* Softirq preemption could deadlock timer removal */
12415 +#ifdef CONFIG_PREEMPT_RT_BASE
12416 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
12417 +#else
12418 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
12419 +#endif
12421  /* Query timers: */
12422  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
12424 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
12425   * Helper function to check, whether the timer is running the callback
12426   * function
12427   */
12428 -static inline int hrtimer_callback_running(struct hrtimer *timer)
12429 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
12431         return timer->base->cpu_base->running == timer;
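On RT, hrtimer callbacks run from softirq context, so a task that spins waiting for a callback to finish can deadlock with the softirq thread; hrtimer_wait_for_timer(), declared above, blocks until the callback is done and degrades to cpu_relax() on non-RT kernels. The fs/timerfd.c hunk earlier in this patch uses it roughly like this (alarm-timer branch omitted for brevity):

	for (;;) {
		spin_lock_irq(&ctx->wqh.lock);
		if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
			break;			/* cancelled or idle: lock stays held */
		spin_unlock_irq(&ctx->wqh.lock);
		hrtimer_wait_for_timer(&ctx->t.tmr);	/* was: cpu_relax() */
	}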
12433 diff --git a/include/linux/idr.h b/include/linux/idr.h
12434 index 013fd9bc4cb6..f62be0aec911 100644
12435 --- a/include/linux/idr.h
12436 +++ b/include/linux/idr.h
12437 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
12438   * Each idr_preload() should be matched with an invocation of this
12439   * function.  See idr_preload() for details.
12440   */
12441 +#ifdef CONFIG_PREEMPT_RT_FULL
12442 +void idr_preload_end(void);
12443 +#else
12444  static inline void idr_preload_end(void)
12446         preempt_enable();
12448 +#endif
12450  /**
12451   * idr_find - return pointer for given id
12452 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
12453 index 1c1ff7e4faa4..60fadde71a44 100644
12454 --- a/include/linux/init_task.h
12455 +++ b/include/linux/init_task.h
12456 @@ -148,9 +148,15 @@ extern struct task_group root_task_group;
12457  # define INIT_PERF_EVENTS(tsk)
12458  #endif
12460 +#ifdef CONFIG_PREEMPT_RT_BASE
12461 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
12462 +#else
12463 +# define INIT_TIMER_LIST
12464 +#endif
12466  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
12467  # define INIT_VTIME(tsk)                                               \
12468 -       .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
12469 +       .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
12470         .vtime_snap = 0,                                \
12471         .vtime_snap_whence = VTIME_SYS,
12472  #else
12473 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
12474         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
12475         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
12476         .timer_slack_ns = 50000, /* 50 usec default slack */            \
12477 +       INIT_TIMER_LIST                                                 \
12478         .pids = {                                                       \
12479                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
12480                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
12481 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
12482 index ad16809c8596..655cee096aed 100644
12483 --- a/include/linux/interrupt.h
12484 +++ b/include/linux/interrupt.h
12485 @@ -61,6 +61,7 @@
12486   *                interrupt handler after suspending interrupts. For system
12487   *                wakeup devices users need to implement wakeup detection in
12488   *                their interrupt handlers.
12489 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
12490   */
12491  #define IRQF_SHARED            0x00000080
12492  #define IRQF_PROBE_SHARED      0x00000100
12493 @@ -74,6 +75,7 @@
12494  #define IRQF_NO_THREAD         0x00010000
12495  #define IRQF_EARLY_RESUME      0x00020000
12496  #define IRQF_COND_SUSPEND      0x00040000
12497 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
12499  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
12501 @@ -186,7 +188,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
12502  #ifdef CONFIG_LOCKDEP
12503  # define local_irq_enable_in_hardirq() do { } while (0)
12504  #else
12505 -# define local_irq_enable_in_hardirq() local_irq_enable()
12506 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
12507  #endif
12509  extern void disable_irq_nosync(unsigned int irq);
12510 @@ -206,6 +208,7 @@ extern void resume_device_irqs(void);
12511   * @irq:               Interrupt to which notification applies
12512   * @kref:              Reference count, for internal use
12513   * @work:              Work item, for internal use
12514 + * @list:              List item for deferred callbacks
12515   * @notify:            Function to be called on change.  This will be
12516   *                     called in process context.
12517   * @release:           Function to be called on release.  This will be
12518 @@ -217,6 +220,7 @@ struct irq_affinity_notify {
12519         unsigned int irq;
12520         struct kref kref;
12521         struct work_struct work;
12522 +       struct list_head list;
12523         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
12524         void (*release)(struct kref *ref);
12525  };
12526 @@ -379,9 +383,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
12527                                  bool state);
12529  #ifdef CONFIG_IRQ_FORCED_THREADING
12530 +# ifndef CONFIG_PREEMPT_RT_BASE
12531  extern bool force_irqthreads;
12532 +# else
12533 +#  define force_irqthreads     (true)
12534 +# endif
12535  #else
12536 -#define force_irqthreads       (0)
12537 +#define force_irqthreads       (false)
12538  #endif
12540  #ifndef __ARCH_SET_SOFTIRQ_PENDING
12541 @@ -438,9 +446,10 @@ struct softirq_action
12542         void    (*action)(struct softirq_action *);
12543  };
12545 +#ifndef CONFIG_PREEMPT_RT_FULL
12546  asmlinkage void do_softirq(void);
12547  asmlinkage void __do_softirq(void);
12549 +static inline void thread_do_softirq(void) { do_softirq(); }
12550  #ifdef __ARCH_HAS_DO_SOFTIRQ
12551  void do_softirq_own_stack(void);
12552  #else
12553 @@ -449,13 +458,25 @@ static inline void do_softirq_own_stack(void)
12554         __do_softirq();
12556  #endif
12557 +#else
12558 +extern void thread_do_softirq(void);
12559 +#endif
12561  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
12562  extern void softirq_init(void);
12563  extern void __raise_softirq_irqoff(unsigned int nr);
12564 +#ifdef CONFIG_PREEMPT_RT_FULL
12565 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
12566 +#else
12567 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
12569 +       __raise_softirq_irqoff(nr);
12571 +#endif
12573  extern void raise_softirq_irqoff(unsigned int nr);
12574  extern void raise_softirq(unsigned int nr);
12575 +extern void softirq_check_pending_idle(void);
12577  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
12579 @@ -477,8 +498,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
12580       to be executed on some cpu at least once after this.
12581     * If the tasklet is already scheduled, but its execution is still not
12582       started, it will be executed only once.
12583 -   * If this tasklet is already running on another CPU (or schedule is called
12584 -     from tasklet itself), it is rescheduled for later.
12585 +   * If this tasklet is already running on another CPU, it is rescheduled
12586 +     for later.
12587 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
12588     * Tasklet is strictly serialized wrt itself, but not
12589       wrt another tasklets. If client needs some intertask synchronization,
12590       he makes it with spinlocks.
12591 @@ -503,27 +525,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
12592  enum
12594         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
12595 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
12596 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
12597 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
12598  };
12600 -#ifdef CONFIG_SMP
12601 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
12602 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
12603 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
12605 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
12606  static inline int tasklet_trylock(struct tasklet_struct *t)
12608         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
12611 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
12613 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
12616  static inline void tasklet_unlock(struct tasklet_struct *t)
12618         smp_mb__before_atomic();
12619         clear_bit(TASKLET_STATE_RUN, &(t)->state);
12622 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
12624 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
12626 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
12628  #else
12629  #define tasklet_trylock(t) 1
12630 +#define tasklet_tryunlock(t)   1
12631  #define tasklet_unlock_wait(t) do { } while (0)
12632  #define tasklet_unlock(t) do { } while (0)
12633  #endif
12634 @@ -572,12 +603,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
12635         smp_mb();
12638 -static inline void tasklet_enable(struct tasklet_struct *t)
12640 -       smp_mb__before_atomic();
12641 -       atomic_dec(&t->count);
12644 +extern void tasklet_enable(struct tasklet_struct *t);
12645  extern void tasklet_kill(struct tasklet_struct *t);
12646  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
12647  extern void tasklet_init(struct tasklet_struct *t,
12648 @@ -608,6 +634,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
12649         tasklet_kill(&ttimer->tasklet);
12652 +#ifdef CONFIG_PREEMPT_RT_FULL
12653 +extern void softirq_early_init(void);
12654 +#else
12655 +static inline void softirq_early_init(void) { }
12656 +#endif
12658  /*
12659   * Autoprobing for irqs:
12660   *
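The tasklet changes above add a PENDING state and a tasklet_tryunlock() helper: the RUN bit is cleared with a cmpxchg() so that a tasklet which was scheduled again while its handler ran is detected and requeued instead of being lost. Roughly, a runner can use the pair as sketched below; the real RT tasklet_action logic carries more bookkeeping:

	if (tasklet_trylock(t)) {
		if (!atomic_read(&t->count))
			t->func(t->data);	/* run the handler */
		if (!tasklet_tryunlock(t)) {
			/* SCHED/PENDING was set while we ran: the tasklet
			 * must be processed again rather than dropped. */
		}
	}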
12661 diff --git a/include/linux/irq.h b/include/linux/irq.h
12662 index f7cade00c525..dac9e11ba037 100644
12663 --- a/include/linux/irq.h
12664 +++ b/include/linux/irq.h
12665 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
12666   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
12667   *                               it from the spurious interrupt detection
12668   *                               mechanism and from core side polling.
12669 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
12670   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
12671   */
12672  enum {
12673 @@ -99,13 +100,14 @@ enum {
12674         IRQ_PER_CPU_DEVID       = (1 << 17),
12675         IRQ_IS_POLLED           = (1 << 18),
12676         IRQ_DISABLE_UNLAZY      = (1 << 19),
12677 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
12678  };
12680  #define IRQF_MODIFY_MASK       \
12681         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
12682          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
12683          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
12684 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
12685 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
12687  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
12689 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
12690 index 47b9ebd4a74f..2543aab05daa 100644
12691 --- a/include/linux/irq_work.h
12692 +++ b/include/linux/irq_work.h
12693 @@ -16,6 +16,7 @@
12694  #define IRQ_WORK_BUSY          2UL
12695  #define IRQ_WORK_FLAGS         3UL
12696  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
12697 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
12699  struct irq_work {
12700         unsigned long flags;
12701 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
12702  static inline void irq_work_run(void) { }
12703  #endif
12705 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12706 +void irq_work_tick_soft(void);
12707 +#else
12708 +static inline void irq_work_tick_soft(void) { }
12709 +#endif
12711  #endif /* _LINUX_IRQ_WORK_H */
12712 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
12713 index a587a33363c7..ad57402a242d 100644
12714 --- a/include/linux/irqdesc.h
12715 +++ b/include/linux/irqdesc.h
12716 @@ -61,6 +61,7 @@ struct irq_desc {
12717         unsigned int            irqs_unhandled;
12718         atomic_t                threads_handled;
12719         int                     threads_handled_last;
12720 +       u64                     random_ip;
12721         raw_spinlock_t          lock;
12722         struct cpumask          *percpu_enabled;
12723  #ifdef CONFIG_SMP
12724 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
12725 index 5dd1272d1ab2..9b77034f7c5e 100644
12726 --- a/include/linux/irqflags.h
12727 +++ b/include/linux/irqflags.h
12728 @@ -25,8 +25,6 @@
12729  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
12730  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
12731  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
12732 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
12733 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
12734  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
12735  #else
12736  # define trace_hardirqs_on()           do { } while (0)
12737 @@ -39,9 +37,15 @@
12738  # define trace_softirqs_enabled(p)     0
12739  # define trace_hardirq_enter()         do { } while (0)
12740  # define trace_hardirq_exit()          do { } while (0)
12741 +# define INIT_TRACE_IRQFLAGS
12742 +#endif
12744 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
12745 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
12746 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
12747 +#else
12748  # define lockdep_softirq_enter()       do { } while (0)
12749  # define lockdep_softirq_exit()                do { } while (0)
12750 -# define INIT_TRACE_IRQFLAGS
12751  #endif
12753  #if defined(CONFIG_IRQSOFF_TRACER) || \
12754 @@ -148,4 +152,23 @@
12756  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
12759 + * local_irq* variants depending on RT/!RT
12760 + */
12761 +#ifdef CONFIG_PREEMPT_RT_FULL
12762 +# define local_irq_disable_nort()      do { } while (0)
12763 +# define local_irq_enable_nort()       do { } while (0)
12764 +# define local_irq_save_nort(flags)    local_save_flags(flags)
12765 +# define local_irq_restore_nort(flags) (void)(flags)
12766 +# define local_irq_disable_rt()                local_irq_disable()
12767 +# define local_irq_enable_rt()         local_irq_enable()
12768 +#else
12769 +# define local_irq_disable_nort()      local_irq_disable()
12770 +# define local_irq_enable_nort()       local_irq_enable()
12771 +# define local_irq_save_nort(flags)    local_irq_save(flags)
12772 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
12773 +# define local_irq_disable_rt()                do { } while (0)
12774 +# define local_irq_enable_rt()         do { } while (0)
12775 +#endif
12777  #endif
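The _nort variants above disable interrupts only on non-RT kernels; on RT they merely save or discard the flags so the section stays preemptible. The fs/ntfs/aops.c hunk earlier in this patch uses them around its post-read fixups, i.e. in the shape:

	unsigned long flags;

	local_irq_save_nort(flags);	/* hard-disables IRQs only when !RT */
	kaddr = kmap_atomic(page);
	/* ... fix up the record in place ... */
	kunmap_atomic(kaddr);
	local_irq_restore_nort(flags);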
12778 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
12779 index 65407f6c9120..eb5aabe4e18c 100644
12780 --- a/include/linux/jbd2.h
12781 +++ b/include/linux/jbd2.h
12782 @@ -352,32 +352,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
12784  static inline void jbd_lock_bh_state(struct buffer_head *bh)
12786 +#ifndef CONFIG_PREEMPT_RT_BASE
12787         bit_spin_lock(BH_State, &bh->b_state);
12788 +#else
12789 +       spin_lock(&bh->b_state_lock);
12790 +#endif
12793  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
12795 +#ifndef CONFIG_PREEMPT_RT_BASE
12796         return bit_spin_trylock(BH_State, &bh->b_state);
12797 +#else
12798 +       return spin_trylock(&bh->b_state_lock);
12799 +#endif
12802  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
12804 +#ifndef CONFIG_PREEMPT_RT_BASE
12805         return bit_spin_is_locked(BH_State, &bh->b_state);
12806 +#else
12807 +       return spin_is_locked(&bh->b_state_lock);
12808 +#endif
12811  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
12813 +#ifndef CONFIG_PREEMPT_RT_BASE
12814         bit_spin_unlock(BH_State, &bh->b_state);
12815 +#else
12816 +       spin_unlock(&bh->b_state_lock);
12817 +#endif
12820  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
12822 +#ifndef CONFIG_PREEMPT_RT_BASE
12823         bit_spin_lock(BH_JournalHead, &bh->b_state);
12824 +#else
12825 +       spin_lock(&bh->b_journal_head_lock);
12826 +#endif
12829  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
12831 +#ifndef CONFIG_PREEMPT_RT_BASE
12832         bit_spin_unlock(BH_JournalHead, &bh->b_state);
12833 +#else
12834 +       spin_unlock(&bh->b_journal_head_lock);
12835 +#endif
12838  #define J_ASSERT(assert)       BUG_ON(!(assert))
12839 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
12840 index a19bcf9e762e..897495386446 100644
12841 --- a/include/linux/kdb.h
12842 +++ b/include/linux/kdb.h
12843 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
12844  extern __printf(1, 2) int kdb_printf(const char *, ...);
12845  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
12847 +#define in_kdb_printk()        (kdb_trap_printk)
12848  extern void kdb_init(int level);
12850  /* Access to kdb specific polling devices */
12851 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
12852  extern int kdb_unregister(char *);
12853  #else /* ! CONFIG_KGDB_KDB */
12854  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
12855 +#define in_kdb_printk() (0)
12856  static inline void kdb_init(int level) {}
12857  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
12858                                char *help, short minlen) { return 0; }
12859 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
12860 index 50220cab738c..d68f639f7330 100644
12861 --- a/include/linux/kernel.h
12862 +++ b/include/linux/kernel.h
12863 @@ -188,6 +188,9 @@ extern int _cond_resched(void);
12864   */
12865  # define might_sleep() \
12866         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12868 +# define might_sleep_no_state_check() \
12869 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
12870  # define sched_annotate_sleep()        (current->task_state_change = 0)
12871  #else
12872    static inline void ___might_sleep(const char *file, int line,
12873 @@ -195,6 +198,7 @@ extern int _cond_resched(void);
12874    static inline void __might_sleep(const char *file, int line,
12875                                    int preempt_offset) { }
12876  # define might_sleep() do { might_resched(); } while (0)
12877 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
12878  # define sched_annotate_sleep() do { } while (0)
12879  #endif
12881 @@ -255,6 +259,7 @@ extern long (*panic_blink)(int state);
12882  __printf(1, 2)
12883  void panic(const char *fmt, ...)
12884         __noreturn __cold;
12885 +void nmi_panic(struct pt_regs *regs, const char *msg);
12886  extern void oops_enter(void);
12887  extern void oops_exit(void);
12888  void print_oops_end_marker(void);
12889 @@ -448,6 +453,14 @@ extern int sysctl_panic_on_stackoverflow;
12890  extern bool crash_kexec_post_notifiers;
12892  /*
12893 + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
12894 + * holds a CPU number which is executing panic() currently. A value of
12895 + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
12896 + */
12897 +extern atomic_t panic_cpu;
12898 +#define PANIC_CPU_INVALID      -1
12901   * Only to be used by arch init code. If the user over-wrote the default
12902   * CONFIG_PANIC_TIMEOUT, honor it.
12903   */
12904 @@ -475,6 +488,7 @@ extern enum system_states {
12905         SYSTEM_HALT,
12906         SYSTEM_POWER_OFF,
12907         SYSTEM_RESTART,
12908 +       SYSTEM_SUSPEND,
12909  } system_state;
12911  #define TAINT_PROPRIETARY_MODULE       0
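panic_cpu serializes panic() and crash_kexec(): the first CPU to enter records its number and every other CPU backs off. nmi_panic() builds on it so an NMI handler can request a panic without re-entering panic() when another CPU already owns it. A minimal sketch of the claim step, assuming the usual atomic_cmpxchg() pattern; the real helper also handles the panicking CPU taking its own NMI:

	int old_cpu, this_cpu = raw_smp_processor_id();

	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
	if (old_cpu == PANIC_CPU_INVALID)
		panic("NMI panic on CPU %d", this_cpu);
	/* otherwise another CPU already owns the panic; do not re-enter panic() */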
12912 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
12913 index d7ce4e3280db..0b6d392b38e7 100644
12914 --- a/include/linux/kvm_host.h
12915 +++ b/include/linux/kvm_host.h
12916 @@ -25,6 +25,7 @@
12917  #include <linux/irqflags.h>
12918  #include <linux/context_tracking.h>
12919  #include <linux/irqbypass.h>
12920 +#include <linux/swait.h>
12921  #include <asm/signal.h>
12923  #include <linux/kvm.h>
12924 @@ -243,7 +244,7 @@ struct kvm_vcpu {
12925         int fpu_active;
12926         int guest_fpu_loaded, guest_xcr0_loaded;
12927         unsigned char fpu_counter;
12928 -       wait_queue_head_t wq;
12929 +       struct swait_queue_head wq;
12930         struct pid *pid;
12931         int sigset_active;
12932         sigset_t sigset;
12933 @@ -794,7 +795,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
12935  #endif
12937 -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12938 +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
12940  #ifdef __KVM_HAVE_ARCH_WQP
12941         return vcpu->arch.wqp;
12942 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
12943 index c92ebd100d9b..6f035f635d0e 100644
12944 --- a/include/linux/lglock.h
12945 +++ b/include/linux/lglock.h
12946 @@ -34,13 +34,30 @@
12947  #endif
12949  struct lglock {
12950 +#ifdef CONFIG_PREEMPT_RT_FULL
12951 +       struct rt_mutex __percpu *lock;
12952 +#else
12953         arch_spinlock_t __percpu *lock;
12954 +#endif
12955  #ifdef CONFIG_DEBUG_LOCK_ALLOC
12956         struct lock_class_key lock_key;
12957         struct lockdep_map    lock_dep_map;
12958  #endif
12959  };
12961 +#ifdef CONFIG_PREEMPT_RT_FULL
12962 +# define DEFINE_LGLOCK(name)                                           \
12963 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12964 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12965 +       struct lglock name = { .lock = &name ## _lock }
12967 +# define DEFINE_STATIC_LGLOCK(name)                                    \
12968 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
12969 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
12970 +       static struct lglock name = { .lock = &name ## _lock }
12972 +#else
12974  #define DEFINE_LGLOCK(name)                                            \
12975         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12976         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12977 @@ -50,6 +67,7 @@ struct lglock {
12978         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
12979         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
12980         static struct lglock name = { .lock = &name ## _lock }
12981 +#endif
12983  void lg_lock_init(struct lglock *lg, char *name);
12985 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
12986  void lg_global_lock(struct lglock *lg);
12987  void lg_global_unlock(struct lglock *lg);
12989 +#ifndef CONFIG_PREEMPT_RT_FULL
12990 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
12991 +#else
12992 +void lg_global_trylock_relax(struct lglock *lg);
12993 +#endif
12995  #else
12996  /* When !CONFIG_SMP, map lglock to spinlock */
12997  #define lglock spinlock
12998 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
12999 index 8132214e8efd..89ffaa7bd342 100644
13000 --- a/include/linux/list_bl.h
13001 +++ b/include/linux/list_bl.h
13002 @@ -2,6 +2,7 @@
13003  #define _LINUX_LIST_BL_H
13005  #include <linux/list.h>
13006 +#include <linux/spinlock.h>
13007  #include <linux/bit_spinlock.h>
13009  /*
13010 @@ -32,13 +33,24 @@
13012  struct hlist_bl_head {
13013         struct hlist_bl_node *first;
13014 +#ifdef CONFIG_PREEMPT_RT_BASE
13015 +       raw_spinlock_t lock;
13016 +#endif
13017  };
13019  struct hlist_bl_node {
13020         struct hlist_bl_node *next, **pprev;
13021  };
13022 -#define INIT_HLIST_BL_HEAD(ptr) \
13023 -       ((ptr)->first = NULL)
13025 +#ifdef CONFIG_PREEMPT_RT_BASE
13026 +#define INIT_HLIST_BL_HEAD(h)          \
13027 +do {                                   \
13028 +       (h)->first = NULL;              \
13029 +       raw_spin_lock_init(&(h)->lock); \
13030 +} while (0)
13031 +#else
13032 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
13033 +#endif
13035  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
13037 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
13039  static inline void hlist_bl_lock(struct hlist_bl_head *b)
13041 +#ifndef CONFIG_PREEMPT_RT_BASE
13042         bit_spin_lock(0, (unsigned long *)b);
13043 +#else
13044 +       raw_spin_lock(&b->lock);
13045 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13046 +       __set_bit(0, (unsigned long *)b);
13047 +#endif
13048 +#endif
13051  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
13053 +#ifndef CONFIG_PREEMPT_RT_BASE
13054         __bit_spin_unlock(0, (unsigned long *)b);
13055 +#else
13056 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
13057 +       __clear_bit(0, (unsigned long *)b);
13058 +#endif
13059 +       raw_spin_unlock(&b->lock);
13060 +#endif
13063  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
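hlist_bl normally packs its lock into bit 0 of the first-node pointer via bit_spin_lock(); the hunk above gives RT a real raw_spinlock_t in the head instead, while still mirroring bit 0 so hlist_bl_is_locked() keeps working. A rough userspace sketch of the bit-spinlock idea the non-RT path relies on (C11 atomics; names are illustrative only):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Bit 0 of "head" doubles as the lock, as in the non-RT hlist_bl code. */
static _Atomic uintptr_t head;

static void bit_lock(_Atomic uintptr_t *word)
{
        /* Spin until we are the one who set bit 0 (old value had it clear). */
        while (atomic_fetch_or(word, (uintptr_t)1) & 1)
                ;
}

static void bit_unlock(_Atomic uintptr_t *word)
{
        atomic_fetch_and(word, ~(uintptr_t)1);
}

int main(void)
{
        bit_lock(&head);
        printf("locked, bit 0 = %lu\n", (unsigned long)(head & 1));
        bit_unlock(&head);
        printf("unlocked, bit 0 = %lu\n", (unsigned long)(head & 1));
        return 0;
}

The loop above busy-waits and cannot sleep; on RT the same serialization comes from the raw_spinlock_t added to hlist_bl_head, with the bit only set so hlist_bl_is_locked() still observes the locked state.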
13064 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
13065 new file mode 100644
13066 index 000000000000..e572a3971631
13067 --- /dev/null
13068 +++ b/include/linux/locallock.h
13069 @@ -0,0 +1,276 @@
13070 +#ifndef _LINUX_LOCALLOCK_H
13071 +#define _LINUX_LOCALLOCK_H
13073 +#include <linux/percpu.h>
13074 +#include <linux/spinlock.h>
13076 +#ifdef CONFIG_PREEMPT_RT_BASE
13078 +#ifdef CONFIG_DEBUG_SPINLOCK
13079 +# define LL_WARN(cond) WARN_ON(cond)
13080 +#else
13081 +# define LL_WARN(cond) do { } while (0)
13082 +#endif
13085 + * per cpu lock based substitute for local_irq_*()
13086 + */
13087 +struct local_irq_lock {
13088 +       spinlock_t              lock;
13089 +       struct task_struct      *owner;
13090 +       int                     nestcnt;
13091 +       unsigned long           flags;
13094 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
13095 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
13096 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
13098 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
13099 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
13101 +#define local_irq_lock_init(lvar)                                      \
13102 +       do {                                                            \
13103 +               int __cpu;                                              \
13104 +               for_each_possible_cpu(__cpu)                            \
13105 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
13106 +       } while (0)
13109 + * spin_lock|trylock|unlock_local flavours that do not disable migration,
13110 + * used by __local_lock|trylock|unlock where get_local_var/put_local_var
13111 + * already take care of the migrate_disable/enable.
13112 + * For CONFIG_PREEMPT_BASE these map to the normal spin_* calls.
13113 + */
13114 +#ifdef CONFIG_PREEMPT_RT_FULL
13115 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
13116 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
13117 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
13118 +#else
13119 +# define spin_lock_local(lock)                 spin_lock(lock)
13120 +# define spin_trylock_local(lock)              spin_trylock(lock)
13121 +# define spin_unlock_local(lock)               spin_unlock(lock)
13122 +#endif
13124 +static inline void __local_lock(struct local_irq_lock *lv)
13126 +       if (lv->owner != current) {
13127 +               spin_lock_local(&lv->lock);
13128 +               LL_WARN(lv->owner);
13129 +               LL_WARN(lv->nestcnt);
13130 +               lv->owner = current;
13131 +       }
13132 +       lv->nestcnt++;
13135 +#define local_lock(lvar)                                       \
13136 +       do { __local_lock(&get_local_var(lvar)); } while (0)
13138 +#define local_lock_on(lvar, cpu)                               \
13139 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
13141 +static inline int __local_trylock(struct local_irq_lock *lv)
13143 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
13144 +               LL_WARN(lv->owner);
13145 +               LL_WARN(lv->nestcnt);
13146 +               lv->owner = current;
13147 +               lv->nestcnt = 1;
13148 +               return 1;
13149 +       }
13150 +       return 0;
13153 +#define local_trylock(lvar)                                            \
13154 +       ({                                                              \
13155 +               int __locked;                                           \
13156 +               __locked = __local_trylock(&get_local_var(lvar));       \
13157 +               if (!__locked)                                          \
13158 +                       put_local_var(lvar);                            \
13159 +               __locked;                                               \
13160 +       })
13162 +static inline void __local_unlock(struct local_irq_lock *lv)
13164 +       LL_WARN(lv->nestcnt == 0);
13165 +       LL_WARN(lv->owner != current);
13166 +       if (--lv->nestcnt)
13167 +               return;
13169 +       lv->owner = NULL;
13170 +       spin_unlock_local(&lv->lock);
13173 +#define local_unlock(lvar)                                     \
13174 +       do {                                                    \
13175 +               __local_unlock(this_cpu_ptr(&lvar));            \
13176 +               put_local_var(lvar);                            \
13177 +       } while (0)
13179 +#define local_unlock_on(lvar, cpu)                       \
13180 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
13182 +static inline void __local_lock_irq(struct local_irq_lock *lv)
13184 +       spin_lock_irqsave(&lv->lock, lv->flags);
13185 +       LL_WARN(lv->owner);
13186 +       LL_WARN(lv->nestcnt);
13187 +       lv->owner = current;
13188 +       lv->nestcnt = 1;
13191 +#define local_lock_irq(lvar)                                           \
13192 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
13194 +#define local_lock_irq_on(lvar, cpu)                                   \
13195 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
13197 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
13199 +       LL_WARN(!lv->nestcnt);
13200 +       LL_WARN(lv->owner != current);
13201 +       lv->owner = NULL;
13202 +       lv->nestcnt = 0;
13203 +       spin_unlock_irq(&lv->lock);
13206 +#define local_unlock_irq(lvar)                                         \
13207 +       do {                                                            \
13208 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
13209 +               put_local_var(lvar);                                    \
13210 +       } while (0)
13212 +#define local_unlock_irq_on(lvar, cpu)                                 \
13213 +       do {                                                            \
13214 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
13215 +       } while (0)
13217 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
13219 +       if (lv->owner != current) {
13220 +               __local_lock_irq(lv);
13221 +               return 0;
13222 +       } else {
13223 +               lv->nestcnt++;
13224 +               return 1;
13225 +       }
13228 +#define local_lock_irqsave(lvar, _flags)                               \
13229 +       do {                                                            \
13230 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
13231 +                       put_local_var(lvar);                            \
13232 +               _flags = __this_cpu_read(lvar.flags);                   \
13233 +       } while (0)
13235 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
13236 +       do {                                                            \
13237 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
13238 +               _flags = per_cpu(lvar, cpu).flags;                      \
13239 +       } while (0)
13241 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
13242 +                                           unsigned long flags)
13244 +       LL_WARN(!lv->nestcnt);
13245 +       LL_WARN(lv->owner != current);
13246 +       if (--lv->nestcnt)
13247 +               return 0;
13249 +       lv->owner = NULL;
13250 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
13251 +       return 1;
13254 +#define local_unlock_irqrestore(lvar, flags)                           \
13255 +       do {                                                            \
13256 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
13257 +                       put_local_var(lvar);                            \
13258 +       } while (0)
13260 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
13261 +       do {                                                            \
13262 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
13263 +       } while (0)
13265 +#define local_spin_trylock_irq(lvar, lock)                             \
13266 +       ({                                                              \
13267 +               int __locked;                                           \
13268 +               local_lock_irq(lvar);                                   \
13269 +               __locked = spin_trylock(lock);                          \
13270 +               if (!__locked)                                          \
13271 +                       local_unlock_irq(lvar);                         \
13272 +               __locked;                                               \
13273 +       })
13275 +#define local_spin_lock_irq(lvar, lock)                                        \
13276 +       do {                                                            \
13277 +               local_lock_irq(lvar);                                   \
13278 +               spin_lock(lock);                                        \
13279 +       } while (0)
13281 +#define local_spin_unlock_irq(lvar, lock)                              \
13282 +       do {                                                            \
13283 +               spin_unlock(lock);                                      \
13284 +               local_unlock_irq(lvar);                                 \
13285 +       } while (0)
13287 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
13288 +       do {                                                            \
13289 +               local_lock_irqsave(lvar, flags);                        \
13290 +               spin_lock(lock);                                        \
13291 +       } while (0)
13293 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
13294 +       do {                                                            \
13295 +               spin_unlock(lock);                                      \
13296 +               local_unlock_irqrestore(lvar, flags);                   \
13297 +       } while (0)
13299 +#define get_locked_var(lvar, var)                                      \
13300 +       (*({                                                            \
13301 +               local_lock(lvar);                                       \
13302 +               this_cpu_ptr(&var);                                     \
13303 +       }))
13305 +#define put_locked_var(lvar, var)      local_unlock(lvar);
13307 +#define local_lock_cpu(lvar)                                           \
13308 +       ({                                                              \
13309 +               local_lock(lvar);                                       \
13310 +               smp_processor_id();                                     \
13311 +       })
13313 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
13315 +#else /* PREEMPT_RT_BASE */
13317 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
13318 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
13320 +static inline void local_irq_lock_init(int lvar) { }
13322 +#define local_lock(lvar)                       preempt_disable()
13323 +#define local_unlock(lvar)                     preempt_enable()
13324 +#define local_lock_irq(lvar)                   local_irq_disable()
13325 +#define local_unlock_irq(lvar)                 local_irq_enable()
13326 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
13327 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
13329 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
13330 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
13331 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
13332 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
13333 +       spin_lock_irqsave(lock, flags)
13334 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
13335 +       spin_unlock_irqrestore(lock, flags)
13337 +#define get_locked_var(lvar, var)              get_cpu_var(var)
13338 +#define put_locked_var(lvar, var)              put_cpu_var(var)
13340 +#define local_lock_cpu(lvar)                   get_cpu()
13341 +#define local_unlock_cpu(lvar)                 put_cpu()
13343 +#endif
13345 +#endif
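The new locallock.h gives RT a per-CPU, owner-tracking, nestable lock that stands in for local_irq_disable()/local_irq_save() sections. The core of __local_lock()/__local_unlock() is recursion-by-owner bookkeeping around an ordinary sleeping lock. A small userspace analogue of just that bookkeeping, with a pthread mutex playing the role of the per-CPU spinlock_t (illustrative names, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Stand-in for struct local_irq_lock: a lock plus owner and nesting
 * count, so the same thread may "take" it repeatedly.  (In the kernel
 * version only the owner can ever observe itself in ->owner, which is
 * what makes the unlocked owner check safe there.) */
struct local_lock {
        pthread_mutex_t lock;
        pthread_t owner;
        int owned;              /* owner is only valid while owned != 0 */
        int nestcnt;
};

static struct local_lock ll = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void local_lock_take(struct local_lock *lv)
{
        if (!lv->owned || !pthread_equal(lv->owner, pthread_self())) {
                pthread_mutex_lock(&lv->lock);
                lv->owner = pthread_self();
                lv->owned = 1;
        }
        lv->nestcnt++;
}

static void local_lock_release(struct local_lock *lv)
{
        if (--lv->nestcnt)
                return;
        lv->owned = 0;
        pthread_mutex_unlock(&lv->lock);
}

int main(void)
{
        local_lock_take(&ll);
        local_lock_take(&ll);           /* nested acquisition, no deadlock */
        printf("nestcnt = %d\n", ll.nestcnt);
        local_lock_release(&ll);
        local_lock_release(&ll);
        printf("released\n");
        return 0;
}

On !RT the whole header collapses to preempt_disable()/local_irq_disable(), as the #else branch at the end of the file shows.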
13346 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
13347 index 2ccccbfcd532..4b016eb6265b 100644
13348 --- a/include/linux/mm_types.h
13349 +++ b/include/linux/mm_types.h
13350 @@ -11,6 +11,7 @@
13351  #include <linux/completion.h>
13352  #include <linux/cpumask.h>
13353  #include <linux/uprobes.h>
13354 +#include <linux/rcupdate.h>
13355  #include <linux/page-flags-layout.h>
13356  #include <asm/page.h>
13357  #include <asm/mmu.h>
13358 @@ -505,6 +506,9 @@ struct mm_struct {
13359         bool tlb_flush_pending;
13360  #endif
13361         struct uprobes_state uprobes_state;
13362 +#ifdef CONFIG_PREEMPT_RT_BASE
13363 +       struct rcu_head delayed_drop;
13364 +#endif
13365  #ifdef CONFIG_X86_INTEL_MPX
13366         /* address of the bounds directory */
13367         void __user *bd_addr;
13368 diff --git a/include/linux/module.h b/include/linux/module.h
13369 index b229a9961d02..5fea847cf95c 100644
13370 --- a/include/linux/module.h
13371 +++ b/include/linux/module.h
13372 @@ -500,6 +500,7 @@ static inline int module_is_live(struct module *mod)
13373  struct module *__module_text_address(unsigned long addr);
13374  struct module *__module_address(unsigned long addr);
13375  bool is_module_address(unsigned long addr);
13376 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
13377  bool is_module_percpu_address(unsigned long addr);
13378  bool is_module_text_address(unsigned long addr);
13380 @@ -665,6 +666,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
13381         return false;
13384 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
13386 +       return false;
13389  static inline bool is_module_text_address(unsigned long addr)
13391         return false;
13392 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
13393 index 2cb7531e7d7a..b3fdfc820216 100644
13394 --- a/include/linux/mutex.h
13395 +++ b/include/linux/mutex.h
13396 @@ -19,6 +19,17 @@
13397  #include <asm/processor.h>
13398  #include <linux/osq_lock.h>
13400 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13401 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13402 +       , .dep_map = { .name = #lockname }
13403 +#else
13404 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13405 +#endif
13407 +#ifdef CONFIG_PREEMPT_RT_FULL
13408 +# include <linux/mutex_rt.h>
13409 +#else
13411  /*
13412   * Simple, straightforward mutexes with strict semantics:
13413   *
13414 @@ -99,13 +110,6 @@ do {                                                        \
13415  static inline void mutex_destroy(struct mutex *lock) {}
13416  #endif
13418 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
13419 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
13420 -               , .dep_map = { .name = #lockname }
13421 -#else
13422 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
13423 -#endif
13425  #define __MUTEX_INITIALIZER(lockname) \
13426                 { .count = ATOMIC_INIT(1) \
13427                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
13428 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
13429  extern int mutex_trylock(struct mutex *lock);
13430  extern void mutex_unlock(struct mutex *lock);
13432 +#endif /* !PREEMPT_RT_FULL */
13434  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
13436  #endif /* __LINUX_MUTEX_H */
13437 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
13438 new file mode 100644
13439 index 000000000000..e0284edec655
13440 --- /dev/null
13441 +++ b/include/linux/mutex_rt.h
13442 @@ -0,0 +1,89 @@
13443 +#ifndef __LINUX_MUTEX_RT_H
13444 +#define __LINUX_MUTEX_RT_H
13446 +#ifndef __LINUX_MUTEX_H
13447 +#error "Please include mutex.h"
13448 +#endif
13450 +#include <linux/rtmutex.h>
13452 +/* FIXME: Just for __lockfunc */
13453 +#include <linux/spinlock.h>
13455 +struct mutex {
13456 +       struct rt_mutex         lock;
13457 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13458 +       struct lockdep_map      dep_map;
13459 +#endif
13462 +#define __MUTEX_INITIALIZER(mutexname)                                 \
13463 +       {                                                               \
13464 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
13465 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
13466 +       }
13468 +#define DEFINE_MUTEX(mutexname)                                                \
13469 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
13471 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
13472 +extern void __lockfunc _mutex_lock(struct mutex *lock);
13473 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
13474 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
13475 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
13476 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
13477 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
13478 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
13479 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
13480 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
13482 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
13483 +#define mutex_lock(l)                  _mutex_lock(l)
13484 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
13485 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
13486 +#define mutex_trylock(l)               _mutex_trylock(l)
13487 +#define mutex_unlock(l)                        _mutex_unlock(l)
13489 +#ifdef CONFIG_DEBUG_MUTEXES
13490 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
13491 +#else
13492 +static inline void mutex_destroy(struct mutex *lock) {}
13493 +#endif
13495 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13496 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
13497 +# define mutex_lock_interruptible_nested(l, s) \
13498 +                                       _mutex_lock_interruptible_nested(l, s)
13499 +# define mutex_lock_killable_nested(l, s) \
13500 +                                       _mutex_lock_killable_nested(l, s)
13502 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
13503 +do {                                                                   \
13504 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
13505 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
13506 +} while (0)
13508 +#else
13509 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
13510 +# define mutex_lock_interruptible_nested(l, s) \
13511 +                                       _mutex_lock_interruptible(l)
13512 +# define mutex_lock_killable_nested(l, s) \
13513 +                                       _mutex_lock_killable(l)
13514 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
13515 +#endif
13517 +# define mutex_init(mutex)                             \
13518 +do {                                                   \
13519 +       static struct lock_class_key __key;             \
13520 +                                                       \
13521 +       rt_mutex_init(&(mutex)->lock);                  \
13522 +       __mutex_do_init((mutex), #mutex, &__key);       \
13523 +} while (0)
13525 +# define __mutex_init(mutex, name, key)                        \
13526 +do {                                                   \
13527 +       rt_mutex_init(&(mutex)->lock);                  \
13528 +       __mutex_do_init((mutex), name, key);            \
13529 +} while (0)
13531 +#endif
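mutex_rt.h swaps every struct mutex for a thin wrapper around rt_mutex, i.e. a sleeping lock with priority inheritance, while keeping the mutex_lock()/mutex_unlock() API intact. The closest userspace analogue is a POSIX mutex using the PTHREAD_PRIO_INHERIT protocol; a minimal sketch under that assumption (glibc pthreads, nothing here is kernel API):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

/* A mutex whose holder is priority-boosted by blocked higher-priority
 * waiters - the property rt_mutex gives struct mutex on RT. */
static pthread_mutex_t pi_lock;

static int pi_lock_init(void)
{
        pthread_mutexattr_t attr;
        int ret;

        pthread_mutexattr_init(&attr);
        ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        if (!ret)
                ret = pthread_mutex_init(&pi_lock, &attr);
        pthread_mutexattr_destroy(&attr);
        return ret;
}

int main(void)
{
        if (pi_lock_init()) {
                fprintf(stderr, "PTHREAD_PRIO_INHERIT not supported\n");
                return 1;
        }
        pthread_mutex_lock(&pi_lock);
        printf("holding a priority-inheriting lock\n");
        pthread_mutex_unlock(&pi_lock);
        pthread_mutex_destroy(&pi_lock);
        return 0;
}

Note how __DEP_MAP_MUTEX_INITIALIZER is hoisted above the CONFIG_PREEMPT_RT_FULL switch in mutex.h so both implementations can share the lockdep annotation.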
13532 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
13533 index 4035bbe40971..81ebe70d107d 100644
13534 --- a/include/linux/netdevice.h
13535 +++ b/include/linux/netdevice.h
13536 @@ -390,7 +390,19 @@ typedef enum rx_handler_result rx_handler_result_t;
13537  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
13539  void __napi_schedule(struct napi_struct *n);
13542 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
13543 + * run as threads, and they can also be preempted (without PREEMPT_RT,
13544 + * interrupt threads cannot be preempted). This means that a call to
13545 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
13546 + * and can corrupt the napi->poll_list.
13547 + */
13548 +#ifdef CONFIG_PREEMPT_RT_FULL
13549 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
13550 +#else
13551  void __napi_schedule_irqoff(struct napi_struct *n);
13552 +#endif
13554  static inline bool napi_disable_pending(struct napi_struct *n)
13556 @@ -2288,11 +2300,20 @@ void netdev_freemem(struct net_device *dev);
13557  void synchronize_net(void);
13558  int init_dummy_netdev(struct net_device *dev);
13560 +#ifdef CONFIG_PREEMPT_RT_FULL
13561 +static inline int dev_recursion_level(void)
13563 +       return current->xmit_recursion;
13566 +#else
13568  DECLARE_PER_CPU(int, xmit_recursion);
13569  static inline int dev_recursion_level(void)
13571         return this_cpu_read(xmit_recursion);
13573 +#endif
13575  struct net_device *dev_get_by_index(struct net *net, int ifindex);
13576  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
13577 @@ -2610,6 +2631,7 @@ struct softnet_data {
13578         unsigned int            dropped;
13579         struct sk_buff_head     input_pkt_queue;
13580         struct napi_struct      backlog;
13581 +       struct sk_buff_head     tofree_queue;
13583  };
13585 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
13586 index 04078e8a4803..a61c9609e32f 100644
13587 --- a/include/linux/netfilter/x_tables.h
13588 +++ b/include/linux/netfilter/x_tables.h
13589 @@ -4,6 +4,7 @@
13591  #include <linux/netdevice.h>
13592  #include <linux/static_key.h>
13593 +#include <linux/locallock.h>
13594  #include <uapi/linux/netfilter/x_tables.h>
13596  /**
13597 @@ -289,6 +290,8 @@ void xt_free_table_info(struct xt_table_info *info);
13598   */
13599  DECLARE_PER_CPU(seqcount_t, xt_recseq);
13601 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
13603  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
13604   *
13605   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
13606 @@ -309,6 +312,9 @@ static inline unsigned int xt_write_recseq_begin(void)
13608         unsigned int addend;
13610 +       /* RT protection */
13611 +       local_lock(xt_write_lock);
13613         /*
13614          * Low order bit of sequence is set if we already
13615          * called xt_write_recseq_begin().
13616 @@ -339,6 +345,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
13617         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
13618         smp_wmb();
13619         __this_cpu_add(xt_recseq.sequence, addend);
13620 +       local_unlock(xt_write_lock);
13623  /*
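The x_tables change takes the new local lock around xt_write_recseq_begin()/end() because, with preemptible softirqs on RT, two writers on the same CPU could otherwise interleave their sequence-counter updates. The general pattern - a sequence counter whose writers are serialized by a lock while readers retry on odd or changed values - can be sketched in userspace like this (C11 atomics; names are illustrative, and the local_lock() in the patch plays the role of wlock for same-CPU writers):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;
static pthread_mutex_t wlock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int data_a, data_b;

static void write_pair(int a, int b)
{
        pthread_mutex_lock(&wlock);
        atomic_fetch_add(&seq, 1);      /* counter goes odd: write begins */
        atomic_store(&data_a, a);
        atomic_store(&data_b, b);
        atomic_fetch_add(&seq, 1);      /* counter even again: write done */
        pthread_mutex_unlock(&wlock);
}

static void read_pair(int *a, int *b)
{
        unsigned int start;

        do {
                while ((start = atomic_load(&seq)) & 1)
                        ;               /* writer in progress, retry */
                *a = atomic_load(&data_a);
                *b = atomic_load(&data_b);
        } while (atomic_load(&seq) != start);
}

int main(void)
{
        int a, b;

        write_pair(1, 2);
        read_pair(&a, &b);
        printf("a=%d b=%d seq=%u\n", a, b, atomic_load(&seq));
        return 0;
}

Without writer serialization a second writer could leave the counter odd or skip a reader's retry window, which is exactly the hole the RT hunk closes.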
13624 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
13625 index d14a4c362465..2e4414a0c1c4 100644
13626 --- a/include/linux/notifier.h
13627 +++ b/include/linux/notifier.h
13628 @@ -6,7 +6,7 @@
13629   *
13630   *                             Alan Cox <Alan.Cox@linux.org>
13631   */
13634  #ifndef _LINUX_NOTIFIER_H
13635  #define _LINUX_NOTIFIER_H
13636  #include <linux/errno.h>
13637 @@ -42,9 +42,7 @@
13638   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
13639   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
13640   * SRCU notifier chains should be used when the chain will be called very
13641 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
13642 - * chains are slightly more difficult to use because they require special
13643 - * runtime initialization.
13644 + * often but notifier_blocks will seldom be removed.
13645   */
13647  typedef        int (*notifier_fn_t)(struct notifier_block *nb,
13648 @@ -88,7 +86,7 @@ struct srcu_notifier_head {
13649                 (name)->head = NULL;            \
13650         } while (0)
13652 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
13653 +/* srcu_notifier_heads must be cleaned up dynamically */
13654  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13655  #define srcu_cleanup_notifier_head(name)       \
13656                 cleanup_srcu_struct(&(name)->srcu);
13657 @@ -101,7 +99,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13658                 .head = NULL }
13659  #define RAW_NOTIFIER_INIT(name)        {                               \
13660                 .head = NULL }
13661 -/* srcu_notifier_heads cannot be initialized statically */
13663 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
13664 +       {                                                       \
13665 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
13666 +               .head = NULL,                                   \
13667 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
13668 +       }
13670  #define ATOMIC_NOTIFIER_HEAD(name)                             \
13671         struct atomic_notifier_head name =                      \
13672 @@ -113,6 +117,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
13673         struct raw_notifier_head name =                         \
13674                 RAW_NOTIFIER_INIT(name)
13676 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
13677 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
13678 +                       name##_head_srcu_array);                \
13679 +       mod struct srcu_notifier_head name =                    \
13680 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
13682 +#define SRCU_NOTIFIER_HEAD(name)                               \
13683 +       _SRCU_NOTIFIER_HEAD(name, )
13685 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
13686 +       _SRCU_NOTIFIER_HEAD(name, static)
13688  #ifdef __KERNEL__
13690  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
13691 @@ -182,12 +198,12 @@ static inline int notifier_to_errno(int ret)
13693  /*
13694   *     Declared notifiers so far. I can imagine quite a few more chains
13695 - *     over time (eg laptop power reset chains, reboot chain (to clean 
13696 + *     over time (eg laptop power reset chains, reboot chain (to clean
13697   *     device units up), device [un]mount chain, module load/unload chain,
13698 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
13699 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
13700   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
13701   */
13704  /* CPU notfiers are defined in include/linux/cpu.h. */
13706  /* netdevice notifiers are defined in include/linux/netdevice.h */
13707 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
13708 index caebf2a758dc..4ecc057b6e27 100644
13709 --- a/include/linux/percpu.h
13710 +++ b/include/linux/percpu.h
13711 @@ -24,6 +24,35 @@
13712          PERCPU_MODULE_RESERVE)
13713  #endif
13715 +#ifdef CONFIG_PREEMPT_RT_FULL
13717 +#define get_local_var(var) (*({                \
13718 +              migrate_disable();       \
13719 +              this_cpu_ptr(&var);      }))
13721 +#define put_local_var(var) do {        \
13722 +       (void)&(var);           \
13723 +       migrate_enable();       \
13724 +} while (0)
13726 +# define get_local_ptr(var) ({         \
13727 +               migrate_disable();      \
13728 +               this_cpu_ptr(var);      })
13730 +# define put_local_ptr(var) do {       \
13731 +       (void)(var);                    \
13732 +       migrate_enable();               \
13733 +} while (0)
13735 +#else
13737 +#define get_local_var(var)     get_cpu_var(var)
13738 +#define put_local_var(var)     put_cpu_var(var)
13739 +#define get_local_ptr(var)     get_cpu_ptr(var)
13740 +#define put_local_ptr(var)     put_cpu_ptr(var)
13742 +#endif
13744  /* minimum unit size, also is the maximum supported allocation size */
13745  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
13747 @@ -116,6 +145,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
13748  #endif
13750  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
13751 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
13752  extern bool is_kernel_percpu_address(unsigned long addr);
13754  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
13755 diff --git a/include/linux/pid.h b/include/linux/pid.h
13756 index 23705a53abba..2cc64b779f03 100644
13757 --- a/include/linux/pid.h
13758 +++ b/include/linux/pid.h
13759 @@ -2,6 +2,7 @@
13760  #define _LINUX_PID_H
13762  #include <linux/rcupdate.h>
13763 +#include <linux/atomic.h>
13765  enum pid_type
13767 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
13768 index 75e4e30677f1..1cfb1cb72354 100644
13769 --- a/include/linux/preempt.h
13770 +++ b/include/linux/preempt.h
13771 @@ -50,7 +50,11 @@
13772  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
13773  #define NMI_OFFSET     (1UL << NMI_SHIFT)
13775 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
13776 +#ifndef CONFIG_PREEMPT_RT_FULL
13777 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
13778 +#else
13779 +# define SOFTIRQ_DISABLE_OFFSET                (0)
13780 +#endif
13782  /* We use the MSB mostly because its available */
13783  #define PREEMPT_NEED_RESCHED   0x80000000
13784 @@ -59,9 +63,15 @@
13785  #include <asm/preempt.h>
13787  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
13788 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
13789  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
13790                                  | NMI_MASK))
13791 +#ifndef CONFIG_PREEMPT_RT_FULL
13792 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
13793 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
13794 +#else
13795 +# define softirq_count()       (0UL)
13796 +extern int in_serving_softirq(void);
13797 +#endif
13799  /*
13800   * Are we doing bottom half or hardware interrupt processing?
13801 @@ -72,7 +82,6 @@
13802  #define in_irq()               (hardirq_count())
13803  #define in_softirq()           (softirq_count())
13804  #define in_interrupt()         (irq_count())
13805 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
13807  /*
13808   * Are we in NMI context?
13809 @@ -91,7 +100,11 @@
13810  /*
13811   * The preempt_count offset after spin_lock()
13812   */
13813 +#if !defined(CONFIG_PREEMPT_RT_FULL)
13814  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
13815 +#else
13816 +#define PREEMPT_LOCK_OFFSET    0
13817 +#endif
13819  /*
13820   * The preempt_count offset needed for things like:
13821 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
13822  #define preempt_count_inc() preempt_count_add(1)
13823  #define preempt_count_dec() preempt_count_sub(1)
13825 +#ifdef CONFIG_PREEMPT_LAZY
13826 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
13827 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
13828 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
13829 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
13830 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
13831 +#else
13832 +#define add_preempt_lazy_count(val)    do { } while (0)
13833 +#define sub_preempt_lazy_count(val)    do { } while (0)
13834 +#define inc_preempt_lazy_count()       do { } while (0)
13835 +#define dec_preempt_lazy_count()       do { } while (0)
13836 +#define preempt_lazy_count()           (0)
13837 +#endif
13839  #ifdef CONFIG_PREEMPT_COUNT
13841  #define preempt_disable() \
13842 @@ -148,13 +175,25 @@ do { \
13843         barrier(); \
13844  } while (0)
13846 +#define preempt_lazy_disable() \
13847 +do { \
13848 +       inc_preempt_lazy_count(); \
13849 +       barrier(); \
13850 +} while (0)
13852  #define sched_preempt_enable_no_resched() \
13853  do { \
13854         barrier(); \
13855         preempt_count_dec(); \
13856  } while (0)
13858 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13859 +#ifdef CONFIG_PREEMPT_RT_BASE
13860 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
13861 +# define preempt_check_resched_rt() preempt_check_resched()
13862 +#else
13863 +# define preempt_enable_no_resched() preempt_enable()
13864 +# define preempt_check_resched_rt() barrier();
13865 +#endif
13867  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
13869 @@ -179,6 +218,13 @@ do { \
13870                 __preempt_schedule(); \
13871  } while (0)
13873 +#define preempt_lazy_enable() \
13874 +do { \
13875 +       dec_preempt_lazy_count(); \
13876 +       barrier(); \
13877 +       preempt_check_resched(); \
13878 +} while (0)
13880  #else /* !CONFIG_PREEMPT */
13881  #define preempt_enable() \
13882  do { \
13883 @@ -224,6 +270,7 @@ do { \
13884  #define preempt_disable_notrace()              barrier()
13885  #define preempt_enable_no_resched_notrace()    barrier()
13886  #define preempt_enable_notrace()               barrier()
13887 +#define preempt_check_resched_rt()             barrier()
13888  #define preemptible()                          0
13890  #endif /* CONFIG_PREEMPT_COUNT */
13891 @@ -244,10 +291,31 @@ do { \
13892  } while (0)
13893  #define preempt_fold_need_resched() \
13894  do { \
13895 -       if (tif_need_resched()) \
13896 +       if (tif_need_resched_now()) \
13897                 set_preempt_need_resched(); \
13898  } while (0)
13900 +#ifdef CONFIG_PREEMPT_RT_FULL
13901 +# define preempt_disable_rt()          preempt_disable()
13902 +# define preempt_enable_rt()           preempt_enable()
13903 +# define preempt_disable_nort()                barrier()
13904 +# define preempt_enable_nort()         barrier()
13905 +# ifdef CONFIG_SMP
13906 +   extern void migrate_disable(void);
13907 +   extern void migrate_enable(void);
13908 +# else /* CONFIG_SMP */
13909 +#  define migrate_disable()            barrier()
13910 +#  define migrate_enable()             barrier()
13911 +# endif /* CONFIG_SMP */
13912 +#else
13913 +# define preempt_disable_rt()          barrier()
13914 +# define preempt_enable_rt()           barrier()
13915 +# define preempt_disable_nort()                preempt_disable()
13916 +# define preempt_enable_nort()         preempt_enable()
13917 +# define migrate_disable()             preempt_disable()
13918 +# define migrate_enable()              preempt_enable()
13919 +#endif
13921  #ifdef CONFIG_PREEMPT_NOTIFIERS
13923  struct preempt_notifier;
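The preempt.h changes wire up migrate_disable()/migrate_enable() so that on RT a task can forbid cross-CPU migration while remaining preemptible (on !RT they simply fall back to preempt_disable()/preempt_enable()). A rough userspace illustration of that "stay on this CPU but stay preemptible" semantic, using Linux affinity syscalls rather than the kernel primitive itself (the kernel version is cheaper and nestable; this only shows the idea):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t old_mask, pin_mask;
        int cpu;

        if (sched_getaffinity(0, sizeof(old_mask), &old_mask)) {
                perror("sched_getaffinity");
                return 1;
        }

        cpu = sched_getcpu();
        CPU_ZERO(&pin_mask);
        CPU_SET(cpu, &pin_mask);

        /* "migrate_disable()": pinned to the current CPU, still preemptible. */
        if (sched_setaffinity(0, sizeof(pin_mask), &pin_mask)) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("pinned to CPU %d, other tasks can still preempt us\n", cpu);

        /* "migrate_enable()": restore the original affinity mask. */
        sched_setaffinity(0, sizeof(old_mask), &old_mask);
        return 0;
}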
13924 diff --git a/include/linux/printk.h b/include/linux/printk.h
13925 index 9729565c25ff..9cdca696b718 100644
13926 --- a/include/linux/printk.h
13927 +++ b/include/linux/printk.h
13928 @@ -117,9 +117,11 @@ int no_printk(const char *fmt, ...)
13929  #ifdef CONFIG_EARLY_PRINTK
13930  extern asmlinkage __printf(1, 2)
13931  void early_printk(const char *fmt, ...);
13932 +extern void printk_kill(void);
13933  #else
13934  static inline __printf(1, 2) __cold
13935  void early_printk(const char *s, ...) { }
13936 +static inline void printk_kill(void) { }
13937  #endif
13939  typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
13940 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
13941 index 5d5174b59802..327dddaf4c8f 100644
13942 --- a/include/linux/radix-tree.h
13943 +++ b/include/linux/radix-tree.h
13944 @@ -279,6 +279,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
13945                         unsigned long first_index, unsigned int max_items);
13946  int radix_tree_preload(gfp_t gfp_mask);
13947  int radix_tree_maybe_preload(gfp_t gfp_mask);
13948 +void radix_tree_preload_end(void);
13950  void radix_tree_init(void);
13951  void *radix_tree_tag_set(struct radix_tree_root *root,
13952                         unsigned long index, unsigned int tag);
13953 @@ -301,11 +303,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
13954  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
13955  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
13957 -static inline void radix_tree_preload_end(void)
13959 -       preempt_enable();
13962  /**
13963   * struct radix_tree_iter - radix tree iterator state
13964   *
13965 diff --git a/include/linux/random.h b/include/linux/random.h
13966 index a75840c1aa71..1a804361670c 100644
13967 --- a/include/linux/random.h
13968 +++ b/include/linux/random.h
13969 @@ -20,7 +20,7 @@ struct random_ready_callback {
13970  extern void add_device_randomness(const void *, unsigned int);
13971  extern void add_input_randomness(unsigned int type, unsigned int code,
13972                                  unsigned int value);
13973 -extern void add_interrupt_randomness(int irq, int irq_flags);
13974 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
13976  extern void get_random_bytes(void *buf, int nbytes);
13977  extern int add_random_ready_callback(struct random_ready_callback *rdy);
13978 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
13979 index a5aa7ae671f4..24ddffd25492 100644
13980 --- a/include/linux/rbtree.h
13981 +++ b/include/linux/rbtree.h
13982 @@ -31,7 +31,6 @@
13984  #include <linux/kernel.h>
13985  #include <linux/stddef.h>
13986 -#include <linux/rcupdate.h>
13988  struct rb_node {
13989         unsigned long  __rb_parent_color;
13990 @@ -86,14 +85,8 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
13991         *rb_link = node;
13994 -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
13995 -                                   struct rb_node **rb_link)
13997 -       node->__rb_parent_color = (unsigned long)parent;
13998 -       node->rb_left = node->rb_right = NULL;
14000 -       rcu_assign_pointer(*rb_link, node);
14002 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
14003 +                     struct rb_node **rb_link);
14005  #define rb_entry_safe(ptr, type, member) \
14006         ({ typeof(ptr) ____ptr = (ptr); \
14007 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
14008 index a0189ba67fde..c2f5f955163d 100644
14009 --- a/include/linux/rcupdate.h
14010 +++ b/include/linux/rcupdate.h
14011 @@ -169,6 +169,9 @@ void call_rcu(struct rcu_head *head,
14013  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14015 +#ifdef CONFIG_PREEMPT_RT_FULL
14016 +#define call_rcu_bh    call_rcu
14017 +#else
14018  /**
14019   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
14020   * @head: structure to be used for queueing the RCU updates.
14021 @@ -192,6 +195,7 @@ void call_rcu(struct rcu_head *head,
14022   */
14023  void call_rcu_bh(struct rcu_head *head,
14024                  rcu_callback_t func);
14025 +#endif
14027  /**
14028   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
14029 @@ -292,6 +296,11 @@ void synchronize_rcu(void);
14030   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
14031   */
14032  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
14033 +#ifndef CONFIG_PREEMPT_RT_FULL
14034 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
14035 +#else
14036 +static inline int sched_rcu_preempt_depth(void) { return 0; }
14037 +#endif
14039  #else /* #ifdef CONFIG_PREEMPT_RCU */
14041 @@ -317,6 +326,8 @@ static inline int rcu_preempt_depth(void)
14042         return 0;
14045 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
14047  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14049  /* Internal to kernel */
14050 @@ -489,7 +500,14 @@ extern struct lockdep_map rcu_callback_map;
14051  int debug_lockdep_rcu_enabled(void);
14053  int rcu_read_lock_held(void);
14054 +#ifdef CONFIG_PREEMPT_RT_FULL
14055 +static inline int rcu_read_lock_bh_held(void)
14057 +       return rcu_read_lock_held();
14059 +#else
14060  int rcu_read_lock_bh_held(void);
14061 +#endif
14063  /**
14064   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
14065 @@ -937,10 +955,14 @@ static inline void rcu_read_unlock(void)
14066  static inline void rcu_read_lock_bh(void)
14068         local_bh_disable();
14069 +#ifdef CONFIG_PREEMPT_RT_FULL
14070 +       rcu_read_lock();
14071 +#else
14072         __acquire(RCU_BH);
14073         rcu_lock_acquire(&rcu_bh_lock_map);
14074         RCU_LOCKDEP_WARN(!rcu_is_watching(),
14075                          "rcu_read_lock_bh() used illegally while idle");
14076 +#endif
14079  /*
14080 @@ -950,10 +972,14 @@ static inline void rcu_read_lock_bh(void)
14081   */
14082  static inline void rcu_read_unlock_bh(void)
14084 +#ifdef CONFIG_PREEMPT_RT_FULL
14085 +       rcu_read_unlock();
14086 +#else
14087         RCU_LOCKDEP_WARN(!rcu_is_watching(),
14088                          "rcu_read_unlock_bh() used illegally while idle");
14089         rcu_lock_release(&rcu_bh_lock_map);
14090         __release(RCU_BH);
14091 +#endif
14092         local_bh_enable();
14095 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
14096 index 60d15a080d7c..436c9e62bfc6 100644
14097 --- a/include/linux/rcutree.h
14098 +++ b/include/linux/rcutree.h
14099 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
14100         rcu_note_context_switch();
14103 +#ifdef CONFIG_PREEMPT_RT_FULL
14104 +# define synchronize_rcu_bh    synchronize_rcu
14105 +#else
14106  void synchronize_rcu_bh(void);
14107 +#endif
14108  void synchronize_sched_expedited(void);
14109  void synchronize_rcu_expedited(void);
14111 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
14114  void rcu_barrier(void);
14115 +#ifdef CONFIG_PREEMPT_RT_FULL
14116 +# define rcu_barrier_bh                rcu_barrier
14117 +#else
14118  void rcu_barrier_bh(void);
14119 +#endif
14120  void rcu_barrier_sched(void);
14121  unsigned long get_state_synchronize_rcu(void);
14122  void cond_synchronize_rcu(unsigned long oldstate);
14123 @@ -85,12 +93,10 @@ unsigned long rcu_batches_started(void);
14124  unsigned long rcu_batches_started_bh(void);
14125  unsigned long rcu_batches_started_sched(void);
14126  unsigned long rcu_batches_completed(void);
14127 -unsigned long rcu_batches_completed_bh(void);
14128  unsigned long rcu_batches_completed_sched(void);
14129  void show_rcu_gp_kthreads(void);
14131  void rcu_force_quiescent_state(void);
14132 -void rcu_bh_force_quiescent_state(void);
14133  void rcu_sched_force_quiescent_state(void);
14135  void rcu_idle_enter(void);
14136 @@ -105,6 +111,14 @@ extern int rcu_scheduler_active __read_mostly;
14138  bool rcu_is_watching(void);
14140 +#ifndef CONFIG_PREEMPT_RT_FULL
14141 +void rcu_bh_force_quiescent_state(void);
14142 +unsigned long rcu_batches_completed_bh(void);
14143 +#else
14144 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
14145 +# define rcu_batches_completed_bh      rcu_batches_completed
14146 +#endif
14148  void rcu_all_qs(void);
14150  #endif /* __LINUX_RCUTREE_H */
14151 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
14152 index 1abba5ce2a2f..30211c627511 100644
14153 --- a/include/linux/rtmutex.h
14154 +++ b/include/linux/rtmutex.h
14155 @@ -13,11 +13,15 @@
14156  #define __LINUX_RT_MUTEX_H
14158  #include <linux/linkage.h>
14159 +#include <linux/spinlock_types_raw.h>
14160  #include <linux/rbtree.h>
14161 -#include <linux/spinlock_types.h>
14163  extern int max_lock_depth; /* for sysctl */
14165 +#ifdef CONFIG_DEBUG_MUTEXES
14166 +#include <linux/debug_locks.h>
14167 +#endif
14169  /**
14170   * The rt_mutex structure
14171   *
14172 @@ -31,8 +35,8 @@ struct rt_mutex {
14173         struct rb_root          waiters;
14174         struct rb_node          *waiters_leftmost;
14175         struct task_struct      *owner;
14176 -#ifdef CONFIG_DEBUG_RT_MUTEXES
14177         int                     save_state;
14178 +#ifdef CONFIG_DEBUG_RT_MUTEXES
14179         const char              *name, *file;
14180         int                     line;
14181         void                    *magic;
14182 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
14183  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
14184  #endif
14186 +# define rt_mutex_init(mutex)                                  \
14187 +       do {                                                    \
14188 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
14189 +               __rt_mutex_init(mutex, #mutex);                 \
14190 +       } while (0)
14192  #ifdef CONFIG_DEBUG_RT_MUTEXES
14193  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
14194         , .name = #mutexname, .file = __FILE__, .line = __LINE__
14195 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
14196   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
14197  #else
14198  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
14199 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
14200  # define rt_mutex_debug_task_free(t)                   do { } while (0)
14201  #endif
14203 -#define __RT_MUTEX_INITIALIZER(mutexname) \
14204 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14205 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
14206 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
14207         , .waiters = RB_ROOT \
14208         , .owner = NULL \
14209 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
14210 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
14212 +#define __RT_MUTEX_INITIALIZER(mutexname) \
14213 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
14215 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
14216 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
14217 +       , .save_state = 1 }
14219  #define DEFINE_RT_MUTEX(mutexname) \
14220         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
14221 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
14223  extern void rt_mutex_lock(struct rt_mutex *lock);
14224  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
14225 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
14226  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
14227                                struct hrtimer_sleeper *timeout);
14229 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
14230 new file mode 100644
14231 index 000000000000..49ed2d45d3be
14232 --- /dev/null
14233 +++ b/include/linux/rwlock_rt.h
14234 @@ -0,0 +1,99 @@
14235 +#ifndef __LINUX_RWLOCK_RT_H
14236 +#define __LINUX_RWLOCK_RT_H
14238 +#ifndef __LINUX_SPINLOCK_H
14239 +#error Do not include directly. Use spinlock.h
14240 +#endif
14242 +#define rwlock_init(rwl)                               \
14243 +do {                                                   \
14244 +       static struct lock_class_key __key;             \
14245 +                                                       \
14246 +       rt_mutex_init(&(rwl)->lock);                    \
14247 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
14248 +} while (0)
14250 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
14251 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
14252 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
14253 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
14254 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
14255 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
14256 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
14257 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
14258 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
14259 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
14261 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
14262 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
14264 +#define write_trylock_irqsave(lock, flags)     \
14265 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
14267 +#define read_lock_irqsave(lock, flags)                 \
14268 +       do {                                            \
14269 +               typecheck(unsigned long, flags);        \
14270 +               flags = rt_read_lock_irqsave(lock);     \
14271 +       } while (0)
14273 +#define write_lock_irqsave(lock, flags)                        \
14274 +       do {                                            \
14275 +               typecheck(unsigned long, flags);        \
14276 +               flags = rt_write_lock_irqsave(lock);    \
14277 +       } while (0)
14279 +#define read_lock(lock)                rt_read_lock(lock)
14281 +#define read_lock_bh(lock)                             \
14282 +       do {                                            \
14283 +               local_bh_disable();                     \
14284 +               rt_read_lock(lock);                     \
14285 +       } while (0)
14287 +#define read_lock_irq(lock)    read_lock(lock)
14289 +#define write_lock(lock)       rt_write_lock(lock)
14291 +#define write_lock_bh(lock)                            \
14292 +       do {                                            \
14293 +               local_bh_disable();                     \
14294 +               rt_write_lock(lock);                    \
14295 +       } while (0)
14297 +#define write_lock_irq(lock)   write_lock(lock)
14299 +#define read_unlock(lock)      rt_read_unlock(lock)
14301 +#define read_unlock_bh(lock)                           \
14302 +       do {                                            \
14303 +               rt_read_unlock(lock);                   \
14304 +               local_bh_enable();                      \
14305 +       } while (0)
14307 +#define read_unlock_irq(lock)  read_unlock(lock)
14309 +#define write_unlock(lock)     rt_write_unlock(lock)
14311 +#define write_unlock_bh(lock)                          \
14312 +       do {                                            \
14313 +               rt_write_unlock(lock);                  \
14314 +               local_bh_enable();                      \
14315 +       } while (0)
14317 +#define write_unlock_irq(lock) write_unlock(lock)
14319 +#define read_unlock_irqrestore(lock, flags)            \
14320 +       do {                                            \
14321 +               typecheck(unsigned long, flags);        \
14322 +               (void) flags;                           \
14323 +               rt_read_unlock(lock);                   \
14324 +       } while (0)
14326 +#define write_unlock_irqrestore(lock, flags) \
14327 +       do {                                            \
14328 +               typecheck(unsigned long, flags);        \
14329 +               (void) flags;                           \
14330 +               rt_write_unlock(lock);                  \
14331 +       } while (0)
14333 +#endif
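The rwlock_rt.h mappings above keep the stock rwlock API but back it with rt_mutex calls; as the hunk shows, read_unlock_irqrestore()/write_unlock_irqrestore() simply discard the flags argument, so the _irqsave variants exist only for source compatibility. A minimal usage sketch (hypothetical example_* names, assuming a kernel tree built with PREEMPT_RT_FULL and this header in place):

/* Illustrative only: a hypothetical caller of the unchanged rwlock API. */
#include <linux/spinlock.h>	/* pulls in rwlock_rt.h on PREEMPT_RT_FULL */

static DEFINE_RWLOCK(example_lock);
static int example_value;

static int example_read(void)
{
	int v;

	read_lock(&example_lock);	/* maps to rt_read_lock() */
	v = example_value;
	read_unlock(&example_lock);
	return v;
}

static void example_write(int v)
{
	unsigned long flags;

	/* flags is carried only so call sites stay source compatible;
	 * the unlock path above just typechecks and discards it. */
	write_lock_irqsave(&example_lock, flags);
	example_value = v;
	write_unlock_irqrestore(&example_lock, flags);
}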
14334 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
14335 index cc0072e93e36..d0da966ad7a0 100644
14336 --- a/include/linux/rwlock_types.h
14337 +++ b/include/linux/rwlock_types.h
14338 @@ -1,6 +1,10 @@
14339  #ifndef __LINUX_RWLOCK_TYPES_H
14340  #define __LINUX_RWLOCK_TYPES_H
14342 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
14343 +# error "Do not include directly, include spinlock_types.h"
14344 +#endif
14346  /*
14347   * include/linux/rwlock_types.h - generic rwlock type definitions
14348   *                               and initializers
14349 @@ -43,6 +47,7 @@ typedef struct {
14350                                 RW_DEP_MAP_INIT(lockname) }
14351  #endif
14353 -#define DEFINE_RWLOCK(x)       rwlock_t x = __RW_LOCK_UNLOCKED(x)
14354 +#define DEFINE_RWLOCK(name) \
14355 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14357  #endif /* __LINUX_RWLOCK_TYPES_H */
14358 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
14359 new file mode 100644
14360 index 000000000000..b13832119591
14361 --- /dev/null
14362 +++ b/include/linux/rwlock_types_rt.h
14363 @@ -0,0 +1,33 @@
14364 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
14365 +#define __LINUX_RWLOCK_TYPES_RT_H
14367 +#ifndef __LINUX_SPINLOCK_TYPES_H
14368 +#error "Do not include directly. Include spinlock_types.h instead"
14369 +#endif
14372 + * rwlocks - rtmutex which allows single reader recursion
14373 + */
14374 +typedef struct {
14375 +       struct rt_mutex         lock;
14376 +       int                     read_depth;
14377 +       unsigned int            break_lock;
14378 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14379 +       struct lockdep_map      dep_map;
14380 +#endif
14381 +} rwlock_t;
14383 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14384 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
14385 +#else
14386 +# define RW_DEP_MAP_INIT(lockname)
14387 +#endif
14389 +#define __RW_LOCK_UNLOCKED(name) \
14390 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
14391 +         RW_DEP_MAP_INIT(name) }
14393 +#define DEFINE_RWLOCK(name) \
14394 +       rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
14396 +#endif
14397 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
14398 index 8f498cdde280..2b2148431f14 100644
14399 --- a/include/linux/rwsem.h
14400 +++ b/include/linux/rwsem.h
14401 @@ -18,6 +18,10 @@
14402  #include <linux/osq_lock.h>
14403  #endif
14405 +#ifdef CONFIG_PREEMPT_RT_FULL
14406 +#include <linux/rwsem_rt.h>
14407 +#else /* PREEMPT_RT_FULL */
14409  struct rw_semaphore;
14411  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
14412 @@ -177,4 +181,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
14413  # define up_read_non_owner(sem)                        up_read(sem)
14414  #endif
14416 +#endif /* !PREEMPT_RT_FULL */
14418  #endif /* _LINUX_RWSEM_H */
14419 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
14420 new file mode 100644
14421 index 000000000000..f97860b2e2a4
14422 --- /dev/null
14423 +++ b/include/linux/rwsem_rt.h
14424 @@ -0,0 +1,152 @@
14425 +#ifndef _LINUX_RWSEM_RT_H
14426 +#define _LINUX_RWSEM_RT_H
14428 +#ifndef _LINUX_RWSEM_H
14429 +#error "Include rwsem.h"
14430 +#endif
14433 + * RW-semaphores are a spinlock plus a reader-depth count.
14434 + *
14435 + * Note that the semantics are different from the usual
14436 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
14437 + * multiple readers to hold the lock at once, we only allow
14438 + * a read-lock owner to read-lock recursively. This is
14439 + * better for latency, makes the implementation inherently
14440 + * fair and makes it simpler as well.
14441 + */
14443 +#include <linux/rtmutex.h>
14445 +struct rw_semaphore {
14446 +       struct rt_mutex         lock;
14447 +       int                     read_depth;
14448 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14449 +       struct lockdep_map      dep_map;
14450 +#endif
14453 +#define __RWSEM_INITIALIZER(name) \
14454 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
14455 +         RW_DEP_MAP_INIT(name) }
14457 +#define DECLARE_RWSEM(lockname) \
14458 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
14460 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14461 +                                    struct lock_class_key *key);
14463 +#define __rt_init_rwsem(sem, name, key)                        \
14464 +       do {                                            \
14465 +               rt_mutex_init(&(sem)->lock);            \
14466 +               __rt_rwsem_init((sem), (name), (key));\
14467 +       } while (0)
14469 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
14471 +# define rt_init_rwsem(sem)                            \
14472 +do {                                                   \
14473 +       static struct lock_class_key __key;             \
14474 +                                                       \
14475 +       __rt_init_rwsem((sem), #sem, &__key);           \
14476 +} while (0)
14478 +extern void rt_down_write(struct rw_semaphore *rwsem);
14479 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
14480 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
14481 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
14482 +                                     struct lockdep_map *nest);
14483 +extern void rt__down_read(struct rw_semaphore *rwsem);
14484 +extern void rt_down_read(struct rw_semaphore *rwsem);
14485 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
14486 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
14487 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
14488 +extern void __rt_up_read(struct rw_semaphore *rwsem);
14489 +extern void rt_up_read(struct rw_semaphore *rwsem);
14490 +extern void rt_up_write(struct rw_semaphore *rwsem);
14491 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
14493 +#define init_rwsem(sem)                rt_init_rwsem(sem)
14494 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
14496 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
14498 +       /* rt_mutex_has_waiters() */
14499 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
14502 +static inline void __down_read(struct rw_semaphore *sem)
14504 +       rt__down_read(sem);
14507 +static inline void down_read(struct rw_semaphore *sem)
14509 +       rt_down_read(sem);
14512 +static inline int __down_read_trylock(struct rw_semaphore *sem)
14514 +       return rt__down_read_trylock(sem);
14517 +static inline int down_read_trylock(struct rw_semaphore *sem)
14519 +       return rt_down_read_trylock(sem);
14522 +static inline void down_write(struct rw_semaphore *sem)
14524 +       rt_down_write(sem);
14527 +static inline int down_write_trylock(struct rw_semaphore *sem)
14529 +       return rt_down_write_trylock(sem);
14532 +static inline void __up_read(struct rw_semaphore *sem)
14534 +       __rt_up_read(sem);
14537 +static inline void up_read(struct rw_semaphore *sem)
14539 +       rt_up_read(sem);
14542 +static inline void up_write(struct rw_semaphore *sem)
14544 +       rt_up_write(sem);
14547 +static inline void downgrade_write(struct rw_semaphore *sem)
14549 +       rt_downgrade_write(sem);
14552 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
14554 +       return rt_down_read_nested(sem, subclass);
14557 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
14559 +       rt_down_write_nested(sem, subclass);
14561 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14562 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14563 +               struct rw_semaphore *nest_lock)
14565 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
14568 +#else
14570 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
14571 +               struct rw_semaphore *nest_lock)
14573 +       rt_down_write_nested_lock(sem, NULL);
14575 +#endif
14576 +#endif
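Per the header comment in rwsem_rt.h, the RT rw-semaphore allows only one reader at a time and permits recursion only by the task that already holds the read side. Callers keep the normal rwsem API; a short sketch of that behaviour (hypothetical example_* helpers, assuming a PREEMPT_RT_FULL build):

/* Illustrative only: unchanged rwsem usage on top of the RT variant. */
#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);

static void example_reader(void)
{
	down_read(&example_sem);
	/* The same task may re-take the read side (read_depth grows);
	 * a second, different reader would block until up_read(). */
	down_read(&example_sem);
	up_read(&example_sem);
	up_read(&example_sem);
}

static void example_writer(void)
{
	down_write(&example_sem);
	/* exclusive section */
	up_write(&example_sem);
}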
14577 diff --git a/include/linux/sched.h b/include/linux/sched.h
14578 index 352213b360d7..1c7aa9446684 100644
14579 --- a/include/linux/sched.h
14580 +++ b/include/linux/sched.h
14581 @@ -26,6 +26,7 @@ struct sched_param {
14582  #include <linux/nodemask.h>
14583  #include <linux/mm_types.h>
14584  #include <linux/preempt.h>
14585 +#include <asm/kmap_types.h>
14587  #include <asm/page.h>
14588  #include <asm/ptrace.h>
14589 @@ -182,8 +183,6 @@ extern void update_cpu_load_nohz(void);
14590  static inline void update_cpu_load_nohz(void) { }
14591  #endif
14593 -extern unsigned long get_parent_ip(unsigned long addr);
14595  extern void dump_cpu_task(int cpu);
14597  struct seq_file;
14598 @@ -242,10 +241,7 @@ extern char ___assert_task_state[1 - 2*!!(
14599                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
14600                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
14602 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
14603  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
14604 -#define task_is_stopped_or_traced(task)        \
14605 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
14606  #define task_contributes_to_load(task) \
14607                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
14608                                  (task->flags & PF_FROZEN) == 0 && \
14609 @@ -311,6 +307,11 @@ extern char ___assert_task_state[1 - 2*!!(
14611  #endif
14613 +#define __set_current_state_no_track(state_value)      \
14614 +       do { current->state = (state_value); } while (0)
14615 +#define set_current_state_no_track(state_value)                \
14616 +       set_mb(current->state, (state_value))
14618  /* Task command name length */
14619  #define TASK_COMM_LEN 16
14621 @@ -970,8 +971,18 @@ struct wake_q_head {
14622         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
14624  extern void wake_q_add(struct wake_q_head *head,
14625 -                      struct task_struct *task);
14626 -extern void wake_up_q(struct wake_q_head *head);
14627 +                             struct task_struct *task);
14628 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
14630 +static inline void wake_up_q(struct wake_q_head *head)
14632 +       __wake_up_q(head, false);
14635 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
14637 +       __wake_up_q(head, true);
14640  /*
14641   * sched-domains (multiprocessor balancing) declarations:
14642 @@ -1379,6 +1390,7 @@ struct tlbflush_unmap_batch {
14644  struct task_struct {
14645         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
14646 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
14647         void *stack;
14648         atomic_t usage;
14649         unsigned int flags;     /* per process flags, defined below */
14650 @@ -1415,6 +1427,12 @@ struct task_struct {
14651  #endif
14653         unsigned int policy;
14654 +#ifdef CONFIG_PREEMPT_RT_FULL
14655 +       int migrate_disable;
14656 +# ifdef CONFIG_SCHED_DEBUG
14657 +       int migrate_disable_atomic;
14658 +# endif
14659 +#endif
14660         int nr_cpus_allowed;
14661         cpumask_t cpus_allowed;
14663 @@ -1526,11 +1544,14 @@ struct task_struct {
14664         cputime_t gtime;
14665         struct prev_cputime prev_cputime;
14666  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14667 -       seqlock_t vtime_seqlock;
14668 +       seqcount_t vtime_seqcount;
14669         unsigned long long vtime_snap;
14670         enum {
14671 -               VTIME_SLEEPING = 0,
14672 +               /* Task is sleeping or running in a CPU with VTIME inactive */
14673 +               VTIME_INACTIVE = 0,
14674 +               /* Task runs in userspace in a CPU with VTIME active */
14675                 VTIME_USER,
14676 +               /* Task runs in kernelspace in a CPU with VTIME active */
14677                 VTIME_SYS,
14678         } vtime_snap_whence;
14679  #endif
14680 @@ -1542,6 +1563,9 @@ struct task_struct {
14682         struct task_cputime cputime_expires;
14683         struct list_head cpu_timers[3];
14684 +#ifdef CONFIG_PREEMPT_RT_BASE
14685 +       struct task_struct *posix_timer_list;
14686 +#endif
14688  /* process credentials */
14689         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
14690 @@ -1573,10 +1597,15 @@ struct task_struct {
14691  /* signal handlers */
14692         struct signal_struct *signal;
14693         struct sighand_struct *sighand;
14694 +       struct sigqueue *sigqueue_cache;
14696         sigset_t blocked, real_blocked;
14697         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
14698         struct sigpending pending;
14699 +#ifdef CONFIG_PREEMPT_RT_FULL
14700 +       /* TODO: move me into ->restart_block ? */
14701 +       struct siginfo forced_info;
14702 +#endif
14704         unsigned long sas_ss_sp;
14705         size_t sas_ss_size;
14706 @@ -1800,6 +1829,12 @@ struct task_struct {
14707         unsigned long trace;
14708         /* bitmask and counter of trace recursion */
14709         unsigned long trace_recursion;
14710 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
14711 +       u64 preempt_timestamp_hist;
14712 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
14713 +       long timer_offset;
14714 +#endif
14715 +#endif
14716  #endif /* CONFIG_TRACING */
14717  #ifdef CONFIG_MEMCG
14718         struct mem_cgroup *memcg_in_oom;
14719 @@ -1816,9 +1851,23 @@ struct task_struct {
14720         unsigned int    sequential_io;
14721         unsigned int    sequential_io_avg;
14722  #endif
14723 +#ifdef CONFIG_PREEMPT_RT_BASE
14724 +       struct rcu_head put_rcu;
14725 +       int softirq_nestcnt;
14726 +       unsigned int softirqs_raised;
14727 +#endif
14728 +#ifdef CONFIG_PREEMPT_RT_FULL
14729 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
14730 +       int kmap_idx;
14731 +       pte_t kmap_pte[KM_TYPE_NR];
14732 +# endif
14733 +#endif
14734  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
14735         unsigned long   task_state_change;
14736  #endif
14737 +#ifdef CONFIG_PREEMPT_RT_FULL
14738 +       int xmit_recursion;
14739 +#endif
14740         int pagefault_disabled;
14741  /* CPU-specific state of this task */
14742         struct thread_struct thread;
14743 @@ -1836,9 +1885,6 @@ extern int arch_task_struct_size __read_mostly;
14744  # define arch_task_struct_size (sizeof(struct task_struct))
14745  #endif
14747 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
14748 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
14750  #define TNF_MIGRATED   0x01
14751  #define TNF_NO_GROUP   0x02
14752  #define TNF_SHARED     0x04
14753 @@ -2028,6 +2074,15 @@ extern struct pid *cad_pid;
14754  extern void free_task(struct task_struct *tsk);
14755  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
14757 +#ifdef CONFIG_PREEMPT_RT_BASE
14758 +extern void __put_task_struct_cb(struct rcu_head *rhp);
14760 +static inline void put_task_struct(struct task_struct *t)
14762 +       if (atomic_dec_and_test(&t->usage))
14763 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
14765 +#else
14766  extern void __put_task_struct(struct task_struct *t);
14768  static inline void put_task_struct(struct task_struct *t)
14769 @@ -2035,6 +2090,7 @@ static inline void put_task_struct(struct task_struct *t)
14770         if (atomic_dec_and_test(&t->usage))
14771                 __put_task_struct(t);
14773 +#endif
14775  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
14776  extern void task_cputime(struct task_struct *t,
14777 @@ -2073,6 +2129,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
14778  /*
14779   * Per process flags
14780   */
14781 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
14782  #define PF_EXITING     0x00000004      /* getting shut down */
14783  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
14784  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
14785 @@ -2237,6 +2294,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
14787  extern int set_cpus_allowed_ptr(struct task_struct *p,
14788                                 const struct cpumask *new_mask);
14789 +int migrate_me(void);
14790 +void tell_sched_cpu_down_begin(int cpu);
14791 +void tell_sched_cpu_down_done(int cpu);
14793  #else
14794  static inline void do_set_cpus_allowed(struct task_struct *p,
14795                                       const struct cpumask *new_mask)
14796 @@ -2249,6 +2310,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
14797                 return -EINVAL;
14798         return 0;
14800 +static inline int migrate_me(void) { return 0; }
14801 +static inline void tell_sched_cpu_down_begin(int cpu) { }
14802 +static inline void tell_sched_cpu_down_done(int cpu) { }
14803  #endif
14805  #ifdef CONFIG_NO_HZ_COMMON
14806 @@ -2458,6 +2522,7 @@ extern void xtime_update(unsigned long ticks);
14808  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
14809  extern int wake_up_process(struct task_struct *tsk);
14810 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
14811  extern void wake_up_new_task(struct task_struct *tsk);
14812  #ifdef CONFIG_SMP
14813   extern void kick_process(struct task_struct *tsk);
14814 @@ -2581,12 +2646,24 @@ extern struct mm_struct * mm_alloc(void);
14816  /* mmdrop drops the mm and the page tables */
14817  extern void __mmdrop(struct mm_struct *);
14819  static inline void mmdrop(struct mm_struct * mm)
14821         if (unlikely(atomic_dec_and_test(&mm->mm_count)))
14822                 __mmdrop(mm);
14825 +#ifdef CONFIG_PREEMPT_RT_BASE
14826 +extern void __mmdrop_delayed(struct rcu_head *rhp);
14827 +static inline void mmdrop_delayed(struct mm_struct *mm)
14829 +       if (atomic_dec_and_test(&mm->mm_count))
14830 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
14832 +#else
14833 +# define mmdrop_delayed(mm)    mmdrop(mm)
14834 +#endif
14836  /* mmput gets rid of the mappings and all user-space */
14837  extern void mmput(struct mm_struct *);
14838  /* Grab a reference to a task's mm, if it is not already going away */
14839 @@ -2896,6 +2973,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
14840         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
14843 +#ifdef CONFIG_PREEMPT_LAZY
14844 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
14846 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14849 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
14851 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
14854 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
14856 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
14859 +static inline int need_resched_lazy(void)
14861 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
14864 +static inline int need_resched_now(void)
14866 +       return test_thread_flag(TIF_NEED_RESCHED);
14869 +#else
14870 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
14871 +static inline int need_resched_lazy(void) { return 0; }
14873 +static inline int need_resched_now(void)
14875 +       return test_thread_flag(TIF_NEED_RESCHED);
14878 +#endif
14880  static inline int restart_syscall(void)
14882         set_tsk_thread_flag(current, TIF_SIGPENDING);
14883 @@ -2927,6 +3041,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
14884         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
14887 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
14889 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
14890 +               return true;
14891 +#ifdef CONFIG_PREEMPT_RT_FULL
14892 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
14893 +               return true;
14894 +#endif
14895 +       return false;
14898 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
14900 +       bool traced_stopped;
14902 +#ifdef CONFIG_PREEMPT_RT_FULL
14903 +       unsigned long flags;
14905 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
14906 +       traced_stopped = __task_is_stopped_or_traced(task);
14907 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14908 +#else
14909 +       traced_stopped = __task_is_stopped_or_traced(task);
14910 +#endif
14911 +       return traced_stopped;
14914 +static inline bool task_is_traced(struct task_struct *task)
14916 +       bool traced = false;
14918 +       if (task->state & __TASK_TRACED)
14919 +               return true;
14920 +#ifdef CONFIG_PREEMPT_RT_FULL
14921 +       /* in case the task is sleeping on tasklist_lock */
14922 +       raw_spin_lock_irq(&task->pi_lock);
14923 +       if (task->state & __TASK_TRACED)
14924 +               traced = true;
14925 +       else if (task->saved_state & __TASK_TRACED)
14926 +               traced = true;
14927 +       raw_spin_unlock_irq(&task->pi_lock);
14928 +#endif
14929 +       return traced;
14932  /*
14933   * cond_resched() and cond_resched_lock(): latency reduction via
14934   * explicit rescheduling in places that are safe. The return
14935 @@ -2948,12 +3107,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
14936         __cond_resched_lock(lock);                              \
14937  })
14939 +#ifndef CONFIG_PREEMPT_RT_FULL
14940  extern int __cond_resched_softirq(void);
14942  #define cond_resched_softirq() ({                                      \
14943         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
14944         __cond_resched_softirq();                                       \
14945  })
14946 +#else
14947 +# define cond_resched_softirq()                cond_resched()
14948 +#endif
14950  static inline void cond_resched_rcu(void)
14952 @@ -3115,6 +3278,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
14954  #endif /* CONFIG_SMP */
14956 +static inline int __migrate_disabled(struct task_struct *p)
14958 +#ifdef CONFIG_PREEMPT_RT_FULL
14959 +       return p->migrate_disable;
14960 +#else
14961 +       return 0;
14962 +#endif
14965 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
14966 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
14968 +       if (__migrate_disabled(p))
14969 +               return cpumask_of(task_cpu(p));
14971 +       return &p->cpus_allowed;
14974 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
14976 +       if (__migrate_disabled(p))
14977 +               return 1;
14978 +       return p->nr_cpus_allowed;
14981  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
14982  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
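The sched.h hunk turns tsk_cpus_allowed() into an accessor that collapses to the current CPU while a task has migrate_disable() in effect, and adds tsk_nr_cpus_allowed() to match. A small sketch of how a caller would use the pair (hypothetical example_can_run_on(), assuming this patched sched.h):

/* Illustrative only: a placement check using the new accessors. */
#include <linux/sched.h>
#include <linux/cpumask.h>

static bool example_can_run_on(struct task_struct *p, int cpu)
{
	/* While p->migrate_disable is set, tsk_cpus_allowed() returns
	 * cpumask_of(task_cpu(p)) and tsk_nr_cpus_allowed() returns 1. */
	return tsk_nr_cpus_allowed(p) > 0 &&
	       cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}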
14984 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
14985 index e0582106ef4f..b14f4d2368aa 100644
14986 --- a/include/linux/seqlock.h
14987 +++ b/include/linux/seqlock.h
14988 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
14989         return __read_seqcount_retry(s, start);
14994 -static inline void raw_write_seqcount_begin(seqcount_t *s)
14995 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
14997         s->sequence++;
14998         smp_wmb();
15001 -static inline void raw_write_seqcount_end(seqcount_t *s)
15002 +static inline void raw_write_seqcount_begin(seqcount_t *s)
15004 +       preempt_disable_rt();
15005 +       __raw_write_seqcount_begin(s);
15008 +static inline void __raw_write_seqcount_end(seqcount_t *s)
15010         smp_wmb();
15011         s->sequence++;
15014 +static inline void raw_write_seqcount_end(seqcount_t *s)
15016 +       __raw_write_seqcount_end(s);
15017 +       preempt_enable_rt();
15020  /**
15021   * raw_write_seqcount_barrier - do a seq write barrier
15022   * @s: pointer to seqcount_t
15023 @@ -425,10 +435,32 @@ typedef struct {
15024  /*
15025   * Read side functions for starting and finalizing a read side section.
15026   */
15027 +#ifndef CONFIG_PREEMPT_RT_FULL
15028  static inline unsigned read_seqbegin(const seqlock_t *sl)
15030         return read_seqcount_begin(&sl->seqcount);
15032 +#else
15034 + * Starvation safe read side for RT
15035 + */
15036 +static inline unsigned read_seqbegin(seqlock_t *sl)
15038 +       unsigned ret;
15040 +repeat:
15041 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
15042 +       if (unlikely(ret & 1)) {
15043 +               /*
15044 +                * Take the lock and let the writer proceed (i.e. possibly
15045 +                * boost it), otherwise we could loop here forever.
15046 +                */
15047 +               spin_unlock_wait(&sl->lock);
15048 +               goto repeat;
15049 +       }
15050 +       return ret;
15052 +#endif
15054  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15056 @@ -443,36 +475,36 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
15057  static inline void write_seqlock(seqlock_t *sl)
15059         spin_lock(&sl->lock);
15060 -       write_seqcount_begin(&sl->seqcount);
15061 +       __raw_write_seqcount_begin(&sl->seqcount);
15064  static inline void write_sequnlock(seqlock_t *sl)
15066 -       write_seqcount_end(&sl->seqcount);
15067 +       __raw_write_seqcount_end(&sl->seqcount);
15068         spin_unlock(&sl->lock);
15071  static inline void write_seqlock_bh(seqlock_t *sl)
15073         spin_lock_bh(&sl->lock);
15074 -       write_seqcount_begin(&sl->seqcount);
15075 +       __raw_write_seqcount_begin(&sl->seqcount);
15078  static inline void write_sequnlock_bh(seqlock_t *sl)
15080 -       write_seqcount_end(&sl->seqcount);
15081 +       __raw_write_seqcount_end(&sl->seqcount);
15082         spin_unlock_bh(&sl->lock);
15085  static inline void write_seqlock_irq(seqlock_t *sl)
15087         spin_lock_irq(&sl->lock);
15088 -       write_seqcount_begin(&sl->seqcount);
15089 +       __raw_write_seqcount_begin(&sl->seqcount);
15092  static inline void write_sequnlock_irq(seqlock_t *sl)
15094 -       write_seqcount_end(&sl->seqcount);
15095 +       __raw_write_seqcount_end(&sl->seqcount);
15096         spin_unlock_irq(&sl->lock);
15099 @@ -481,7 +513,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15100         unsigned long flags;
15102         spin_lock_irqsave(&sl->lock, flags);
15103 -       write_seqcount_begin(&sl->seqcount);
15104 +       __raw_write_seqcount_begin(&sl->seqcount);
15105         return flags;
15108 @@ -491,7 +523,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
15109  static inline void
15110  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
15112 -       write_seqcount_end(&sl->seqcount);
15113 +       __raw_write_seqcount_end(&sl->seqcount);
15114         spin_unlock_irqrestore(&sl->lock, flags);
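The seqlock.h changes move the write side to the non-preempt-disabling __raw_write_seqcount_begin()/_end() helpers (the seqlock's own spinlock already serializes writers) and, on RT, make read_seqbegin() wait on that lock instead of spinning so a preempted writer can make progress. The caller-visible pattern is unchanged; a sketch (hypothetical example_* symbols, standard 4.4 seqlock API):

/* Illustrative only: the usual seqlock reader/writer pattern. */
#include <linux/types.h>
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(example_seq);
static u64 example_a, example_b;

static void example_update(u64 a, u64 b)
{
	write_seqlock(&example_seq);	/* writers serialized by sl->lock */
	example_a = a;
	example_b = b;
	write_sequnlock(&example_seq);
}

static u64 example_sum(void)
{
	unsigned int seq;
	u64 sum;

	do {
		seq = read_seqbegin(&example_seq);	/* RT: may wait on sl->lock */
		sum = example_a + example_b;
	} while (read_seqretry(&example_seq, seq));

	return sum;
}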
15117 diff --git a/include/linux/signal.h b/include/linux/signal.h
15118 index d80259afb9e5..ddd1e6866a54 100644
15119 --- a/include/linux/signal.h
15120 +++ b/include/linux/signal.h
15121 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
15124  extern void flush_sigqueue(struct sigpending *queue);
15125 +extern void flush_task_sigqueue(struct task_struct *tsk);
15127  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
15128  static inline int valid_signal(unsigned long sig)
15129 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
15130 index d443d9ab0236..2d1c7f9b7fd0 100644
15131 --- a/include/linux/skbuff.h
15132 +++ b/include/linux/skbuff.h
15133 @@ -203,6 +203,7 @@ struct sk_buff_head {
15135         __u32           qlen;
15136         spinlock_t      lock;
15137 +       raw_spinlock_t  raw_lock;
15138  };
15140  struct sk_buff;
15141 @@ -1465,6 +1466,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
15142         __skb_queue_head_init(list);
15145 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
15147 +       raw_spin_lock_init(&list->raw_lock);
15148 +       __skb_queue_head_init(list);
15151  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
15152                 struct lock_class_key *class)
15154 diff --git a/include/linux/smp.h b/include/linux/smp.h
15155 index c4414074bd88..e6ab36aeaaab 100644
15156 --- a/include/linux/smp.h
15157 +++ b/include/linux/smp.h
15158 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
15159  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
15160  #define put_cpu()              preempt_enable()
15162 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
15163 +#define put_cpu_light()                migrate_enable()
15165  /*
15166   * Callback to arch code if there's nosmp or maxcpus=0 on the
15167   * boot command line:
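get_cpu_light()/put_cpu_light() above pin the task to its current CPU via migrate_disable()/migrate_enable() instead of disabling preemption, so the section may take sleeping RT locks. A minimal sketch of the intended pattern (hypothetical example_* names; note that shared per-CPU state still needs its own serialization because preemption stays enabled):

/* Illustrative only: stable CPU id without disabling preemption. */
#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);

static void example_bump(void)
{
	int cpu = get_cpu_light();	/* migrate_disable() + smp_processor_id() */

	/* The task cannot migrate away from 'cpu' here, but it can still
	 * be preempted, so real users pair this with a local lock or
	 * similar protection for the per-CPU data. */
	per_cpu(example_counter, cpu)++;

	put_cpu_light();		/* migrate_enable() */
}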
15168 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
15169 index 47dd0cebd204..02928fa5499d 100644
15170 --- a/include/linux/spinlock.h
15171 +++ b/include/linux/spinlock.h
15172 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15173  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
15175  /* Include rwlock functions */
15176 -#include <linux/rwlock.h>
15177 +#ifdef CONFIG_PREEMPT_RT_FULL
15178 +# include <linux/rwlock_rt.h>
15179 +#else
15180 +# include <linux/rwlock.h>
15181 +#endif
15183  /*
15184   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
15185 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
15186  # include <linux/spinlock_api_up.h>
15187  #endif
15189 +#ifdef CONFIG_PREEMPT_RT_FULL
15190 +# include <linux/spinlock_rt.h>
15191 +#else /* PREEMPT_RT_FULL */
15193  /*
15194   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
15195   */
15196 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
15197         raw_spin_unlock(&lock->rlock);
15200 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
15202 +       raw_spin_unlock(&lock->rlock);
15203 +       return 0;
15206  static __always_inline void spin_unlock_bh(spinlock_t *lock)
15208         raw_spin_unlock_bh(&lock->rlock);
15209 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
15210  #define atomic_dec_and_lock(atomic, lock) \
15211                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
15213 +#endif /* !PREEMPT_RT_FULL */
15215  #endif /* __LINUX_SPINLOCK_H */
15216 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
15217 index 5344268e6e62..043263f30e81 100644
15218 --- a/include/linux/spinlock_api_smp.h
15219 +++ b/include/linux/spinlock_api_smp.h
15220 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
15221         return 0;
15224 -#include <linux/rwlock_api_smp.h>
15225 +#ifndef CONFIG_PREEMPT_RT_FULL
15226 +# include <linux/rwlock_api_smp.h>
15227 +#endif
15229  #endif /* __LINUX_SPINLOCK_API_SMP_H */
15230 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
15231 new file mode 100644
15232 index 000000000000..7eb87584e843
15233 --- /dev/null
15234 +++ b/include/linux/spinlock_rt.h
15235 @@ -0,0 +1,165 @@
15236 +#ifndef __LINUX_SPINLOCK_RT_H
15237 +#define __LINUX_SPINLOCK_RT_H
15239 +#ifndef __LINUX_SPINLOCK_H
15240 +#error Do not include directly. Use spinlock.h
15241 +#endif
15243 +#include <linux/bug.h>
15245 +extern void
15246 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
15248 +#define spin_lock_init(slock)                          \
15249 +do {                                                   \
15250 +       static struct lock_class_key __key;             \
15251 +                                                       \
15252 +       rt_mutex_init(&(slock)->lock);                  \
15253 +       __rt_spin_lock_init(slock, #slock, &__key);     \
15254 +} while (0)
15256 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
15257 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
15258 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
15260 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
15261 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
15262 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
15263 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
15264 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
15265 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
15266 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
15267 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
15268 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
15269 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
15272 + * lockdep-less calls, for derived types like rwlock:
15273 + * (for trylock they can use rt_mutex_trylock() directly).
15274 + */
15275 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
15276 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
15277 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
15278 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
15280 +#define spin_lock(lock)                        rt_spin_lock(lock)
15282 +#define spin_lock_bh(lock)                     \
15283 +       do {                                    \
15284 +               local_bh_disable();             \
15285 +               rt_spin_lock(lock);             \
15286 +       } while (0)
15288 +#define spin_lock_irq(lock)            spin_lock(lock)
15290 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
15292 +#define spin_trylock(lock)                     \
15293 +({                                             \
15294 +       int __locked;                           \
15295 +       __locked = spin_do_trylock(lock);       \
15296 +       __locked;                               \
15299 +#ifdef CONFIG_LOCKDEP
15300 +# define spin_lock_nested(lock, subclass)              \
15301 +       do {                                            \
15302 +               rt_spin_lock_nested(lock, subclass);    \
15303 +       } while (0)
15305 +#define spin_lock_bh_nested(lock, subclass)            \
15306 +       do {                                            \
15307 +               local_bh_disable();                     \
15308 +               rt_spin_lock_nested(lock, subclass);    \
15309 +       } while (0)
15311 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15312 +       do {                                             \
15313 +               typecheck(unsigned long, flags);         \
15314 +               flags = 0;                               \
15315 +               rt_spin_lock_nested(lock, subclass);     \
15316 +       } while (0)
15317 +#else
15318 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
15319 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
15321 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
15322 +       do {                                             \
15323 +               typecheck(unsigned long, flags);         \
15324 +               flags = 0;                               \
15325 +               spin_lock(lock);                         \
15326 +       } while (0)
15327 +#endif
15329 +#define spin_lock_irqsave(lock, flags)                  \
15330 +       do {                                             \
15331 +               typecheck(unsigned long, flags);         \
15332 +               flags = 0;                               \
15333 +               spin_lock(lock);                         \
15334 +       } while (0)
15336 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
15338 +       unsigned long flags = 0;
15339 +#ifdef CONFIG_TRACE_IRQFLAGS
15340 +       flags = rt_spin_lock_trace_flags(lock);
15341 +#else
15342 +       spin_lock(lock); /* lock_local */
15343 +#endif
15344 +       return flags;
15347 +/* FIXME: we need rt_spin_lock_nest_lock */
15348 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
15350 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
15351 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
15353 +#define spin_unlock_bh(lock)                           \
15354 +       do {                                            \
15355 +               rt_spin_unlock(lock);                   \
15356 +               local_bh_enable();                      \
15357 +       } while (0)
15359 +#define spin_unlock_irq(lock)          spin_unlock(lock)
15361 +#define spin_unlock_irqrestore(lock, flags)            \
15362 +       do {                                            \
15363 +               typecheck(unsigned long, flags);        \
15364 +               (void) flags;                           \
15365 +               spin_unlock(lock);                      \
15366 +       } while (0)
15368 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
15369 +#define spin_trylock_irq(lock) spin_trylock(lock)
15371 +#define spin_trylock_irqsave(lock, flags)      \
15372 +       rt_spin_trylock_irqsave(lock, &(flags))
15374 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
15376 +#ifdef CONFIG_GENERIC_LOCKBREAK
15377 +# define spin_is_contended(lock)       ((lock)->break_lock)
15378 +#else
15379 +# define spin_is_contended(lock)       (((void)(lock), 0))
15380 +#endif
15382 +static inline int spin_can_lock(spinlock_t *lock)
15384 +       return !rt_mutex_is_locked(&lock->lock);
15387 +static inline int spin_is_locked(spinlock_t *lock)
15389 +       return rt_mutex_is_locked(&lock->lock);
15392 +static inline void assert_spin_locked(spinlock_t *lock)
15394 +       BUG_ON(!spin_is_locked(lock));
15397 +#define atomic_dec_and_lock(atomic, lock) \
15398 +       atomic_dec_and_spin_lock(atomic, lock)
15400 +#endif
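With spinlock_rt.h in place, spinlock_t becomes a sleeping rt_mutex and, as the macros above show, spin_lock_irqsave() forces flags to 0 and spin_unlock_irqrestore() discards it, so interrupts are not actually disabled while call sites stay untouched. A brief sketch (hypothetical example_* names, PREEMPT_RT_FULL build assumed):

/* Illustrative only: spinlock_t callers stay source compatible on RT. */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static unsigned long example_events;

static void example_note_event(void)
{
	unsigned long flags;

	/* On PREEMPT_RT_FULL this is rt_spin_lock(); flags is set to 0
	 * and interrupts remain enabled, but the call site is unchanged. */
	spin_lock_irqsave(&example_lock, flags);
	example_events++;
	spin_unlock_irqrestore(&example_lock, flags);
}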
15401 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
15402 index 73548eb13a5d..10bac715ea96 100644
15403 --- a/include/linux/spinlock_types.h
15404 +++ b/include/linux/spinlock_types.h
15405 @@ -9,80 +9,15 @@
15406   * Released under the General Public License (GPL).
15407   */
15409 -#if defined(CONFIG_SMP)
15410 -# include <asm/spinlock_types.h>
15411 -#else
15412 -# include <linux/spinlock_types_up.h>
15413 -#endif
15415 -#include <linux/lockdep.h>
15417 -typedef struct raw_spinlock {
15418 -       arch_spinlock_t raw_lock;
15419 -#ifdef CONFIG_GENERIC_LOCKBREAK
15420 -       unsigned int break_lock;
15421 -#endif
15422 -#ifdef CONFIG_DEBUG_SPINLOCK
15423 -       unsigned int magic, owner_cpu;
15424 -       void *owner;
15425 -#endif
15426 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15427 -       struct lockdep_map dep_map;
15428 -#endif
15429 -} raw_spinlock_t;
15431 -#define SPINLOCK_MAGIC         0xdead4ead
15433 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15435 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15436 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15437 -#else
15438 -# define SPIN_DEP_MAP_INIT(lockname)
15439 -#endif
15440 +#include <linux/spinlock_types_raw.h>
15442 -#ifdef CONFIG_DEBUG_SPINLOCK
15443 -# define SPIN_DEBUG_INIT(lockname)             \
15444 -       .magic = SPINLOCK_MAGIC,                \
15445 -       .owner_cpu = -1,                        \
15446 -       .owner = SPINLOCK_OWNER_INIT,
15447 +#ifndef CONFIG_PREEMPT_RT_FULL
15448 +# include <linux/spinlock_types_nort.h>
15449 +# include <linux/rwlock_types.h>
15450  #else
15451 -# define SPIN_DEBUG_INIT(lockname)
15452 +# include <linux/rtmutex.h>
15453 +# include <linux/spinlock_types_rt.h>
15454 +# include <linux/rwlock_types_rt.h>
15455  #endif
15457 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15458 -       {                                       \
15459 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15460 -       SPIN_DEBUG_INIT(lockname)               \
15461 -       SPIN_DEP_MAP_INIT(lockname) }
15463 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15464 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15466 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15468 -typedef struct spinlock {
15469 -       union {
15470 -               struct raw_spinlock rlock;
15472 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
15473 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15474 -               struct {
15475 -                       u8 __padding[LOCK_PADSIZE];
15476 -                       struct lockdep_map dep_map;
15477 -               };
15478 -#endif
15479 -       };
15480 -} spinlock_t;
15482 -#define __SPIN_LOCK_INITIALIZER(lockname) \
15483 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15485 -#define __SPIN_LOCK_UNLOCKED(lockname) \
15486 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15488 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15490 -#include <linux/rwlock_types.h>
15492  #endif /* __LINUX_SPINLOCK_TYPES_H */
15493 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
15494 new file mode 100644
15495 index 000000000000..f1dac1fb1d6a
15496 --- /dev/null
15497 +++ b/include/linux/spinlock_types_nort.h
15498 @@ -0,0 +1,33 @@
15499 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
15500 +#define __LINUX_SPINLOCK_TYPES_NORT_H
15502 +#ifndef __LINUX_SPINLOCK_TYPES_H
15503 +#error "Do not include directly. Include spinlock_types.h instead"
15504 +#endif
15507 + * The non-RT version maps spinlocks to raw_spinlocks
15508 + */
15509 +typedef struct spinlock {
15510 +       union {
15511 +               struct raw_spinlock rlock;
15513 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15514 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
15515 +               struct {
15516 +                       u8 __padding[LOCK_PADSIZE];
15517 +                       struct lockdep_map dep_map;
15518 +               };
15519 +#endif
15520 +       };
15521 +} spinlock_t;
15523 +#define __SPIN_LOCK_INITIALIZER(lockname) \
15524 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
15526 +#define __SPIN_LOCK_UNLOCKED(lockname) \
15527 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
15529 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
15531 +#endif
15532 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
15533 new file mode 100644
15534 index 000000000000..edffc4d53fc9
15535 --- /dev/null
15536 +++ b/include/linux/spinlock_types_raw.h
15537 @@ -0,0 +1,56 @@
15538 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
15539 +#define __LINUX_SPINLOCK_TYPES_RAW_H
15541 +#if defined(CONFIG_SMP)
15542 +# include <asm/spinlock_types.h>
15543 +#else
15544 +# include <linux/spinlock_types_up.h>
15545 +#endif
15547 +#include <linux/lockdep.h>
15549 +typedef struct raw_spinlock {
15550 +       arch_spinlock_t raw_lock;
15551 +#ifdef CONFIG_GENERIC_LOCKBREAK
15552 +       unsigned int break_lock;
15553 +#endif
15554 +#ifdef CONFIG_DEBUG_SPINLOCK
15555 +       unsigned int magic, owner_cpu;
15556 +       void *owner;
15557 +#endif
15558 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15559 +       struct lockdep_map dep_map;
15560 +#endif
15561 +} raw_spinlock_t;
15563 +#define SPINLOCK_MAGIC         0xdead4ead
15565 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
15567 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15568 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
15569 +#else
15570 +# define SPIN_DEP_MAP_INIT(lockname)
15571 +#endif
15573 +#ifdef CONFIG_DEBUG_SPINLOCK
15574 +# define SPIN_DEBUG_INIT(lockname)             \
15575 +       .magic = SPINLOCK_MAGIC,                \
15576 +       .owner_cpu = -1,                        \
15577 +       .owner = SPINLOCK_OWNER_INIT,
15578 +#else
15579 +# define SPIN_DEBUG_INIT(lockname)
15580 +#endif
15582 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
15583 +       {                                       \
15584 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
15585 +       SPIN_DEBUG_INIT(lockname)               \
15586 +       SPIN_DEP_MAP_INIT(lockname) }
15588 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
15589 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
15591 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
15593 +#endif
15594 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
15595 new file mode 100644
15596 index 000000000000..9fd431967abc
15597 --- /dev/null
15598 +++ b/include/linux/spinlock_types_rt.h
15599 @@ -0,0 +1,51 @@
15600 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
15601 +#define __LINUX_SPINLOCK_TYPES_RT_H
15603 +#ifndef __LINUX_SPINLOCK_TYPES_H
15604 +#error "Do not include directly. Include spinlock_types.h instead"
15605 +#endif
15607 +#include <linux/cache.h>
15610 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
15611 + */
15612 +typedef struct spinlock {
15613 +       struct rt_mutex         lock;
15614 +       unsigned int            break_lock;
15615 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15616 +       struct lockdep_map      dep_map;
15617 +#endif
15618 +} spinlock_t;
15620 +#ifdef CONFIG_DEBUG_RT_MUTEXES
15621 +# define __RT_SPIN_INITIALIZER(name) \
15622 +       { \
15623 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
15624 +       .save_state = 1, \
15625 +       .file = __FILE__, \
15626 +       .line = __LINE__ , \
15627 +       }
15628 +#else
15629 +# define __RT_SPIN_INITIALIZER(name) \
15630 +       {                                                               \
15631 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
15632 +       .save_state = 1, \
15633 +       }
15634 +#endif
15636 +/*
15637 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
15638 +*/
15640 +#define __SPIN_LOCK_UNLOCKED(name)                     \
15641 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
15642 +         SPIN_DEP_MAP_INIT(name) }
15644 +#define __DEFINE_SPINLOCK(name) \
15645 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
15647 +#define DEFINE_SPINLOCK(name) \
15648 +       spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
15650 +#endif
15651 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
15652 index f5f80c5643ac..ec1a8f01563c 100644
15653 --- a/include/linux/srcu.h
15654 +++ b/include/linux/srcu.h
15655 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
15657  void process_srcu(struct work_struct *work);
15659 -#define __SRCU_STRUCT_INIT(name)                                       \
15660 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
15661         {                                                               \
15662                 .completed = -300,                                      \
15663 -               .per_cpu_ref = &name##_srcu_array,                      \
15664 +               .per_cpu_ref = &pcpu_name,                              \
15665                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
15666                 .running = false,                                       \
15667                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
15668 @@ -104,7 +104,7 @@ void process_srcu(struct work_struct *work);
15669   */
15670  #define __DEFINE_SRCU(name, is_static)                                 \
15671         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
15672 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
15673 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
15674  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
15675  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
15677 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
15678 index 8b6ec7ef0854..9b77d4cc929f 100644
15679 --- a/include/linux/suspend.h
15680 +++ b/include/linux/suspend.h
15681 @@ -194,6 +194,12 @@ struct platform_freeze_ops {
15682         void (*end)(void);
15683  };
15685 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
15686 +extern bool pm_in_action;
15687 +#else
15688 +# define pm_in_action false
15689 +#endif
15691  #ifdef CONFIG_SUSPEND
15692  /**
15693   * suspend_set_ops - set platform dependent suspend operations
15694 diff --git a/include/linux/swait.h b/include/linux/swait.h
15695 new file mode 100644
15696 index 000000000000..83f004a72320
15697 --- /dev/null
15698 +++ b/include/linux/swait.h
15699 @@ -0,0 +1,173 @@
15700 +#ifndef _LINUX_SWAIT_H
15701 +#define _LINUX_SWAIT_H
15703 +#include <linux/list.h>
15704 +#include <linux/stddef.h>
15705 +#include <linux/spinlock.h>
15706 +#include <asm/current.h>
15709 + * Simple wait queues
15710 + *
15711 + * While these are very similar to the other/complex wait queues (wait.h) the
15712 + * most important difference is that the simple waitqueue allows for
15713 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15714 + * times.
15715 + *
15716 + * In order to make this so, we had to drop a fair number of features of the
15717 + * other waitqueue code; notably:
15718 + *
15719 + *  - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
15720 + *    all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
15721 + *    sleeper state.
15722 + *
15723 + *  - the exclusive mode; because this requires preserving the list order
15724 + *    and this is hard.
15725 + *
15726 + *  - custom wake functions; because you cannot give any guarantees about
15727 + *    random code.
15728 + *
15729 + * As a side effect of this; the data structures are slimmer.
15730 + *
15731 + * One would recommend using this wait queue where possible.
15732 + */
15734 +struct task_struct;
15736 +struct swait_queue_head {
15737 +       raw_spinlock_t          lock;
15738 +       struct list_head        task_list;
15741 +struct swait_queue {
15742 +       struct task_struct      *task;
15743 +       struct list_head        task_list;
15746 +#define __SWAITQUEUE_INITIALIZER(name) {                               \
15747 +       .task           = current,                                      \
15748 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15751 +#define DECLARE_SWAITQUEUE(name)                                       \
15752 +       struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
15754 +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) {                         \
15755 +       .lock           = __RAW_SPIN_LOCK_UNLOCKED(name.lock),          \
15756 +       .task_list      = LIST_HEAD_INIT((name).task_list),             \
15759 +#define DECLARE_SWAIT_QUEUE_HEAD(name)                                 \
15760 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
15762 +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15763 +                                   struct lock_class_key *key);
15765 +#define init_swait_queue_head(q)                               \
15766 +       do {                                                    \
15767 +               static struct lock_class_key __key;             \
15768 +               __init_swait_queue_head((q), #q, &__key);       \
15769 +       } while (0)
15771 +#ifdef CONFIG_LOCKDEP
15772 +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)                 \
15773 +       ({ init_swait_queue_head(&name); name; })
15774 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15775 +       struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
15776 +#else
15777 +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name)                        \
15778 +       DECLARE_SWAIT_QUEUE_HEAD(name)
15779 +#endif
15781 +static inline int swait_active(struct swait_queue_head *q)
15783 +       return !list_empty(&q->task_list);
15786 +extern void swake_up(struct swait_queue_head *q);
15787 +extern void swake_up_all(struct swait_queue_head *q);
15788 +extern void swake_up_locked(struct swait_queue_head *q);
15789 +extern void swake_up_all_locked(struct swait_queue_head *q);
15791 +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
15792 +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
15793 +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
15795 +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15796 +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
15798 +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
15799 +#define ___swait_event(wq, condition, state, ret, cmd)                 \
15800 +({                                                                     \
15801 +       struct swait_queue __wait;                                      \
15802 +       long __ret = ret;                                               \
15803 +                                                                       \
15804 +       INIT_LIST_HEAD(&__wait.task_list);                              \
15805 +       for (;;) {                                                      \
15806 +               long __int = prepare_to_swait_event(&wq, &__wait, state);\
15807 +                                                                       \
15808 +               if (condition)                                          \
15809 +                       break;                                          \
15810 +                                                                       \
15811 +               if (___wait_is_interruptible(state) && __int) {         \
15812 +                       __ret = __int;                                  \
15813 +                       break;                                          \
15814 +               }                                                       \
15815 +                                                                       \
15816 +               cmd;                                                    \
15817 +       }                                                               \
15818 +       finish_swait(&wq, &__wait);                                     \
15819 +       __ret;                                                          \
15822 +#define __swait_event(wq, condition)                                   \
15823 +       (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0,    \
15824 +                           schedule())
15826 +#define swait_event(wq, condition)                                     \
15827 +do {                                                                   \
15828 +       if (condition)                                                  \
15829 +               break;                                                  \
15830 +       __swait_event(wq, condition);                                   \
15831 +} while (0)
15833 +#define __swait_event_timeout(wq, condition, timeout)                  \
15834 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15835 +                     TASK_UNINTERRUPTIBLE, timeout,                    \
15836 +                     __ret = schedule_timeout(__ret))
15838 +#define swait_event_timeout(wq, condition, timeout)                    \
15839 +({                                                                     \
15840 +       long __ret = timeout;                                           \
15841 +       if (!___wait_cond_timeout(condition))                           \
15842 +               __ret = __swait_event_timeout(wq, condition, timeout);  \
15843 +       __ret;                                                          \
15846 +#define __swait_event_interruptible(wq, condition)                     \
15847 +       ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0,            \
15848 +                     schedule())
15850 +#define swait_event_interruptible(wq, condition)                       \
15851 +({                                                                     \
15852 +       int __ret = 0;                                                  \
15853 +       if (!(condition))                                               \
15854 +               __ret = __swait_event_interruptible(wq, condition);     \
15855 +       __ret;                                                          \
15858 +#define __swait_event_interruptible_timeout(wq, condition, timeout)    \
15859 +       ___swait_event(wq, ___wait_cond_timeout(condition),             \
15860 +                     TASK_INTERRUPTIBLE, timeout,                      \
15861 +                     __ret = schedule_timeout(__ret))
15863 +#define swait_event_interruptible_timeout(wq, condition, timeout)      \
15864 +({                                                                     \
15865 +       long __ret = timeout;                                           \
15866 +       if (!___wait_cond_timeout(condition))                           \
15867 +               __ret = __swait_event_interruptible_timeout(wq,         \
15868 +                                               condition, timeout);    \
15869 +       __ret;                                                          \
15872 +#endif /* _LINUX_SWAIT_H */
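As a point of reference for the declarations above, a caller would pair the sleep and wake sides of the simple waitqueue API roughly as follows. This is only an illustrative sketch, not part of the patch: DECLARE_SWAIT_QUEUE_HEAD() and swake_up() are assumed from earlier in this header, and my_wq / my_done / my_* are made-up names.

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(my_wq);
    static bool my_done;

    static int my_wait_for_done(void)
    {
            /* sleeps in TASK_INTERRUPTIBLE until my_done becomes true */
            return swait_event_interruptible(my_wq, my_done);
    }

    static void my_signal_done(void)
    {
            my_done = true;
            swake_up(&my_wq);       /* wakes at most one waiter */
    }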
15873 diff --git a/include/linux/swap.h b/include/linux/swap.h
15874 index d8ca2eaa3a8b..19e038054914 100644
15875 --- a/include/linux/swap.h
15876 +++ b/include/linux/swap.h
15877 @@ -11,6 +11,7 @@
15878  #include <linux/fs.h>
15879  #include <linux/atomic.h>
15880  #include <linux/page-flags.h>
15881 +#include <linux/locallock.h>
15882  #include <asm/page.h>
15884  struct notifier_block;
15885 @@ -252,7 +253,8 @@ struct swap_info_struct {
15886  void *workingset_eviction(struct address_space *mapping, struct page *page);
15887  bool workingset_refault(void *shadow);
15888  void workingset_activation(struct page *page);
15889 -extern struct list_lru workingset_shadow_nodes;
15890 +extern struct list_lru __workingset_shadow_nodes;
15891 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
15893  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
15895 @@ -298,6 +300,7 @@ extern unsigned long nr_free_pagecache_pages(void);
15898  /* linux/mm/swap.c */
15899 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
15900  extern void lru_cache_add(struct page *);
15901  extern void lru_cache_add_anon(struct page *page);
15902  extern void lru_cache_add_file(struct page *page);
15903 diff --git a/include/linux/swork.h b/include/linux/swork.h
15904 new file mode 100644
15905 index 000000000000..f175fa9a6016
15906 --- /dev/null
15907 +++ b/include/linux/swork.h
15908 @@ -0,0 +1,24 @@
15909 +#ifndef _LINUX_SWORK_H
15910 +#define _LINUX_SWORK_H
15912 +#include <linux/list.h>
15914 +struct swork_event {
15915 +       struct list_head item;
15916 +       unsigned long flags;
15917 +       void (*func)(struct swork_event *);
15920 +static inline void INIT_SWORK(struct swork_event *event,
15921 +                             void (*func)(struct swork_event *))
15923 +       event->flags = 0;
15924 +       event->func = func;
15927 +bool swork_queue(struct swork_event *sev);
15929 +int swork_get(void);
15930 +void swork_put(void);
15932 +#endif /* _LINUX_SWORK_H */
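A minimal usage sketch of this interface (illustrative only, not part of the patch; my_event, my_work_fn and my_init are made-up names): swork_get() brings up or references the worker thread, swork_queue() defers the callback to it, and a matching swork_put() drops the reference again on teardown.

    #include <linux/swork.h>

    static struct swork_event my_event;

    static void my_work_fn(struct swork_event *sev)
    {
            /* runs later, in the swork kernel thread, fully preemptible */
    }

    static int my_init(void)
    {
            int err = swork_get();          /* create/reference the worker */

            if (err)
                    return err;
            INIT_SWORK(&my_event, my_work_fn);
            swork_queue(&my_event);         /* my_work_fn() runs asynchronously */
            return 0;
    }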
15933 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
15934 index ff307b548ed3..be9f9dc6a4e1 100644
15935 --- a/include/linux/thread_info.h
15936 +++ b/include/linux/thread_info.h
15937 @@ -102,7 +102,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
15938  #define test_thread_flag(flag) \
15939         test_ti_thread_flag(current_thread_info(), flag)
15941 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
15942 +#ifdef CONFIG_PREEMPT_LAZY
15943 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
15944 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
15945 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
15946 +#define tif_need_resched_lazy()        test_thread_flag(TIF_NEED_RESCHED_LAZY)
15948 +#else
15949 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
15950 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
15951 +#define tif_need_resched_lazy()        0
15952 +#endif
15954  #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
15955  /*
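To illustrate how these helpers are meant to be consumed (a hedged sketch, not taken from the patch; my_resched_check() is a made-up name): generic code keeps testing tif_need_resched(), which on CONFIG_PREEMPT_LAZY kernels now also reports the lazy flag, while RT-aware paths can distinguish an immediate from a deferrable reschedule request.

    #include <linux/thread_info.h>

    static inline void my_resched_check(void)
    {
            if (tif_need_resched_now()) {
                    /* hard TIF_NEED_RESCHED: reschedule as soon as possible */
            } else if (tif_need_resched_lazy()) {
                    /*
                     * TIF_NEED_RESCHED_LAZY: the reschedule may be deferred
                     * to the next natural preemption point, e.g. the return
                     * to user space.
                     */
            }

            if (tif_need_resched())
                    ;       /* covers both flags when PREEMPT_LAZY is enabled */
    }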
15956 diff --git a/include/linux/timer.h b/include/linux/timer.h
15957 index 61aa61dc410c..299d2b78591f 100644
15958 --- a/include/linux/timer.h
15959 +++ b/include/linux/timer.h
15960 @@ -225,7 +225,7 @@ extern void add_timer(struct timer_list *timer);
15962  extern int try_to_del_timer_sync(struct timer_list *timer);
15964 -#ifdef CONFIG_SMP
15965 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
15966    extern int del_timer_sync(struct timer_list *timer);
15967  #else
15968  # define del_timer_sync(t)             del_timer(t)
15969 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
15970 index 925730bc9fc1..a591f414da6f 100644
15971 --- a/include/linux/trace_events.h
15972 +++ b/include/linux/trace_events.h
15973 @@ -66,6 +66,9 @@ struct trace_entry {
15974         unsigned char           flags;
15975         unsigned char           preempt_count;
15976         int                     pid;
15977 +       unsigned short          migrate_disable;
15978 +       unsigned short          padding;
15979 +       unsigned char           preempt_lazy_count;
15980  };
15982  #define TRACE_EVENT_TYPE_MAX                                           \
15983 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
15984 index 558129af828a..cf5c472bbc79 100644
15985 --- a/include/linux/uaccess.h
15986 +++ b/include/linux/uaccess.h
15987 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
15988   */
15989  static inline void pagefault_disable(void)
15991 +       migrate_disable();
15992         pagefault_disabled_inc();
15993         /*
15994          * make sure to have issued the store before a pagefault
15995 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
15996          */
15997         barrier();
15998         pagefault_disabled_dec();
15999 +       migrate_enable();
16002  /*
16003 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
16004 index 4a29c75b146e..0a294e950df8 100644
16005 --- a/include/linux/uprobes.h
16006 +++ b/include/linux/uprobes.h
16007 @@ -27,6 +27,7 @@
16008  #include <linux/errno.h>
16009  #include <linux/rbtree.h>
16010  #include <linux/types.h>
16011 +#include <linux/wait.h>
16013  struct vm_area_struct;
16014  struct mm_struct;
16015 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
16016 index 3e5d9075960f..7eaa847cd5a5 100644
16017 --- a/include/linux/vmstat.h
16018 +++ b/include/linux/vmstat.h
16019 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
16020   */
16021  static inline void __count_vm_event(enum vm_event_item item)
16023 +       preempt_disable_rt();
16024         raw_cpu_inc(vm_event_states.event[item]);
16025 +       preempt_enable_rt();
16028  static inline void count_vm_event(enum vm_event_item item)
16029 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
16031  static inline void __count_vm_events(enum vm_event_item item, long delta)
16033 +       preempt_disable_rt();
16034         raw_cpu_add(vm_event_states.event[item], delta);
16035 +       preempt_enable_rt();
16038  static inline void count_vm_events(enum vm_event_item item, long delta)
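preempt_disable_rt()/preempt_enable_rt() are introduced elsewhere in this patch; as a rough mental model (names prefixed my_ to stress that this is only an assumed, illustrative definition, not the patch's own) they are expected to expand to real preemption control on RT and to nothing on !RT, so the raw per-cpu updates above cannot be preempted mid-update by callers that are no longer implicitly non-preemptible (e.g. code under a sleeping spinlock):

    #include <linux/preempt.h>

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define my_preempt_disable_rt()        preempt_disable()
    # define my_preempt_enable_rt()         preempt_enable()
    #else
    # define my_preempt_disable_rt()        do { } while (0)
    # define my_preempt_enable_rt()         do { } while (0)
    #endif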
16039 diff --git a/include/linux/wait.h b/include/linux/wait.h
16040 index 513b36f04dfd..981c8a840f96 100644
16041 --- a/include/linux/wait.h
16042 +++ b/include/linux/wait.h
16043 @@ -8,6 +8,7 @@
16044  #include <linux/spinlock.h>
16045  #include <asm/current.h>
16046  #include <uapi/linux/wait.h>
16047 +#include <linux/atomic.h>
16049  typedef struct __wait_queue wait_queue_t;
16050  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
16051 diff --git a/include/net/dst.h b/include/net/dst.h
16052 index c7329dcd90cc..35c3dba16728 100644
16053 --- a/include/net/dst.h
16054 +++ b/include/net/dst.h
16055 @@ -437,7 +437,7 @@ static inline void dst_confirm(struct dst_entry *dst)
16056  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
16057                                    struct sk_buff *skb)
16059 -       const struct hh_cache *hh;
16060 +       struct hh_cache *hh;
16062         if (dst->pending_confirm) {
16063                 unsigned long now = jiffies;
16064 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
16065 index 8b683841e574..bf656008f6e7 100644
16066 --- a/include/net/neighbour.h
16067 +++ b/include/net/neighbour.h
16068 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
16070  #endif
16072 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
16073 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
16075         unsigned int seq;
16076         int hh_len;
16077 @@ -501,7 +501,7 @@ struct neighbour_cb {
16079  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
16081 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
16082 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
16083                                      const struct net_device *dev)
16085         unsigned int seq;
16086 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
16087 index c68926b4899c..dd0751e76065 100644
16088 --- a/include/net/netns/ipv4.h
16089 +++ b/include/net/netns/ipv4.h
16090 @@ -70,6 +70,7 @@ struct netns_ipv4 {
16092         int sysctl_icmp_echo_ignore_all;
16093         int sysctl_icmp_echo_ignore_broadcasts;
16094 +       int sysctl_icmp_echo_sysrq;
16095         int sysctl_icmp_ignore_bogus_error_responses;
16096         int sysctl_icmp_ratelimit;
16097         int sysctl_icmp_ratemask;
16098 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
16099 new file mode 100644
16100 index 000000000000..f7710de1b1f3
16101 --- /dev/null
16102 +++ b/include/trace/events/hist.h
16103 @@ -0,0 +1,73 @@
16104 +#undef TRACE_SYSTEM
16105 +#define TRACE_SYSTEM hist
16107 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
16108 +#define _TRACE_HIST_H
16110 +#include "latency_hist.h"
16111 +#include <linux/tracepoint.h>
16113 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
16114 +#define trace_preemptirqsoff_hist(a, b)
16115 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
16116 +#else
16117 +TRACE_EVENT(preemptirqsoff_hist,
16119 +       TP_PROTO(int reason, int starthist),
16121 +       TP_ARGS(reason, starthist),
16123 +       TP_STRUCT__entry(
16124 +               __field(int,    reason)
16125 +               __field(int,    starthist)
16126 +       ),
16128 +       TP_fast_assign(
16129 +               __entry->reason         = reason;
16130 +               __entry->starthist      = starthist;
16131 +       ),
16133 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
16134 +                 __entry->starthist ? "start" : "stop")
16136 +#endif
16138 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
16139 +#define trace_hrtimer_interrupt(a, b, c, d)
16140 +#else
16141 +TRACE_EVENT(hrtimer_interrupt,
16143 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
16144 +               struct task_struct *task),
16146 +       TP_ARGS(cpu, offset, curr, task),
16148 +       TP_STRUCT__entry(
16149 +               __field(int,            cpu)
16150 +               __field(long long,      offset)
16151 +               __array(char,           ccomm,  TASK_COMM_LEN)
16152 +               __field(int,            cprio)
16153 +               __array(char,           tcomm,  TASK_COMM_LEN)
16154 +               __field(int,            tprio)
16155 +       ),
16157 +       TP_fast_assign(
16158 +               __entry->cpu    = cpu;
16159 +               __entry->offset = offset;
16160 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
16161 +               __entry->cprio  = curr->prio;
16162 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
16163 +                       task != NULL ? TASK_COMM_LEN : 7);
16164 +               __entry->tprio  = task != NULL ? task->prio : -1;
16165 +       ),
16167 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
16168 +               __entry->cpu, __entry->offset, __entry->ccomm,
16169 +               __entry->cprio, __entry->tcomm, __entry->tprio)
16171 +#endif
16173 +#endif /* _TRACE_HIST_H */
16175 +/* This part must be outside protection */
16176 +#include <trace/define_trace.h>
16177 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
16178 new file mode 100644
16179 index 000000000000..d3f2fbd560b1
16180 --- /dev/null
16181 +++ b/include/trace/events/latency_hist.h
16182 @@ -0,0 +1,29 @@
16183 +#ifndef _LATENCY_HIST_H
16184 +#define _LATENCY_HIST_H
16186 +enum hist_action {
16187 +       IRQS_ON,
16188 +       PREEMPT_ON,
16189 +       TRACE_STOP,
16190 +       IRQS_OFF,
16191 +       PREEMPT_OFF,
16192 +       TRACE_START,
16195 +static char *actions[] = {
16196 +       "IRQS_ON",
16197 +       "PREEMPT_ON",
16198 +       "TRACE_STOP",
16199 +       "IRQS_OFF",
16200 +       "PREEMPT_OFF",
16201 +       "TRACE_START",
16204 +static inline char *getaction(int action)
16206 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
16207 +               return actions[action];
16208 +       return "unknown";
16211 +#endif /* _LATENCY_HIST_H */
16212 diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
16213 index fff846b512e6..73614ce1d204 100644
16214 --- a/include/trace/events/writeback.h
16215 +++ b/include/trace/events/writeback.h
16216 @@ -134,58 +134,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
16217  #ifdef CREATE_TRACE_POINTS
16218  #ifdef CONFIG_CGROUP_WRITEBACK
16220 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
16221 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
16223 -       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
16224 +       return wb->memcg_css->cgroup->kn->ino;
16227 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
16229 -       struct cgroup *cgrp = wb->memcg_css->cgroup;
16230 -       char *path;
16232 -       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
16233 -       WARN_ON_ONCE(path != buf);
16236 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
16238 -       if (wbc->wb)
16239 -               return __trace_wb_cgroup_size(wbc->wb);
16240 -       else
16241 -               return 2;
16244 -static inline void __trace_wbc_assign_cgroup(char *buf,
16245 -                                            struct writeback_control *wbc)
16246 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
16248         if (wbc->wb)
16249 -               __trace_wb_assign_cgroup(buf, wbc->wb);
16250 +               return __trace_wb_assign_cgroup(wbc->wb);
16251         else
16252 -               strcpy(buf, "/");
16253 +               return -1U;
16256  #else  /* CONFIG_CGROUP_WRITEBACK */
16258 -static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
16260 -       return 2;
16263 -static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
16265 -       strcpy(buf, "/");
16268 -static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
16269 +static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb)
16271 -       return 2;
16272 +       return -1U;
16275 -static inline void __trace_wbc_assign_cgroup(char *buf,
16276 -                                            struct writeback_control *wbc)
16277 +static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc)
16279 -       strcpy(buf, "/");
16280 +       return -1U;
16283  #endif /* CONFIG_CGROUP_WRITEBACK */
16284 @@ -201,7 +171,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
16285                 __array(char, name, 32)
16286                 __field(unsigned long, ino)
16287                 __field(int, sync_mode)
16288 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16289 +               __field(unsigned int, cgroup_ino)
16290         ),
16292         TP_fast_assign(
16293 @@ -209,14 +179,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
16294                         dev_name(inode_to_bdi(inode)->dev), 32);
16295                 __entry->ino            = inode->i_ino;
16296                 __entry->sync_mode      = wbc->sync_mode;
16297 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16298 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16299         ),
16301 -       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
16302 +       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u",
16303                 __entry->name,
16304                 __entry->ino,
16305                 __entry->sync_mode,
16306 -               __get_str(cgroup)
16307 +               __entry->cgroup_ino
16308         )
16309  );
16311 @@ -246,7 +216,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16312                 __field(int, range_cyclic)
16313                 __field(int, for_background)
16314                 __field(int, reason)
16315 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16316 +               __field(unsigned int, cgroup_ino)
16317         ),
16318         TP_fast_assign(
16319                 strncpy(__entry->name,
16320 @@ -258,10 +228,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16321                 __entry->range_cyclic = work->range_cyclic;
16322                 __entry->for_background = work->for_background;
16323                 __entry->reason = work->reason;
16324 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16325 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16326         ),
16327         TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
16328 -                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
16329 +                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u",
16330                   __entry->name,
16331                   MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
16332                   __entry->nr_pages,
16333 @@ -270,7 +240,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
16334                   __entry->range_cyclic,
16335                   __entry->for_background,
16336                   __print_symbolic(__entry->reason, WB_WORK_REASON),
16337 -                 __get_str(cgroup)
16338 +                 __entry->cgroup_ino
16339         )
16340  );
16341  #define DEFINE_WRITEBACK_WORK_EVENT(name) \
16342 @@ -300,15 +270,15 @@ DECLARE_EVENT_CLASS(writeback_class,
16343         TP_ARGS(wb),
16344         TP_STRUCT__entry(
16345                 __array(char, name, 32)
16346 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16347 +               __field(unsigned int, cgroup_ino)
16348         ),
16349         TP_fast_assign(
16350                 strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
16351 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16352 +               __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
16353         ),
16354 -       TP_printk("bdi %s: cgroup=%s",
16355 +       TP_printk("bdi %s: cgroup_ino=%u",
16356                   __entry->name,
16357 -                 __get_str(cgroup)
16358 +                 __entry->cgroup_ino
16359         )
16360  );
16361  #define DEFINE_WRITEBACK_EVENT(name) \
16362 @@ -347,7 +317,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16363                 __field(int, range_cyclic)
16364                 __field(long, range_start)
16365                 __field(long, range_end)
16366 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16367 +               __field(unsigned int, cgroup_ino)
16368         ),
16370         TP_fast_assign(
16371 @@ -361,12 +331,12 @@ DECLARE_EVENT_CLASS(wbc_class,
16372                 __entry->range_cyclic   = wbc->range_cyclic;
16373                 __entry->range_start    = (long)wbc->range_start;
16374                 __entry->range_end      = (long)wbc->range_end;
16375 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16376 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16377         ),
16379         TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
16380                 "bgrd=%d reclm=%d cyclic=%d "
16381 -               "start=0x%lx end=0x%lx cgroup=%s",
16382 +               "start=0x%lx end=0x%lx cgroup_ino=%u",
16383                 __entry->name,
16384                 __entry->nr_to_write,
16385                 __entry->pages_skipped,
16386 @@ -377,7 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
16387                 __entry->range_cyclic,
16388                 __entry->range_start,
16389                 __entry->range_end,
16390 -               __get_str(cgroup)
16391 +               __entry->cgroup_ino
16392         )
16395 @@ -398,7 +368,7 @@ TRACE_EVENT(writeback_queue_io,
16396                 __field(long,           age)
16397                 __field(int,            moved)
16398                 __field(int,            reason)
16399 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16400 +               __field(unsigned int,   cgroup_ino)
16401         ),
16402         TP_fast_assign(
16403                 unsigned long *older_than_this = work->older_than_this;
16404 @@ -408,15 +378,15 @@ TRACE_EVENT(writeback_queue_io,
16405                                   (jiffies - *older_than_this) * 1000 / HZ : -1;
16406                 __entry->moved  = moved;
16407                 __entry->reason = work->reason;
16408 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16409 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16410         ),
16411 -       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
16412 +       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u",
16413                 __entry->name,
16414                 __entry->older, /* older_than_this in jiffies */
16415                 __entry->age,   /* older_than_this in relative milliseconds */
16416                 __entry->moved,
16417                 __print_symbolic(__entry->reason, WB_WORK_REASON),
16418 -               __get_str(cgroup)
16419 +               __entry->cgroup_ino
16420         )
16421  );
16423 @@ -484,7 +454,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16424                 __field(unsigned long,  dirty_ratelimit)
16425                 __field(unsigned long,  task_ratelimit)
16426                 __field(unsigned long,  balanced_dirty_ratelimit)
16427 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16428 +               __field(unsigned int,   cgroup_ino)
16429         ),
16431         TP_fast_assign(
16432 @@ -496,13 +466,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16433                 __entry->task_ratelimit = KBps(task_ratelimit);
16434                 __entry->balanced_dirty_ratelimit =
16435                                         KBps(wb->balanced_dirty_ratelimit);
16436 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16437 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16438         ),
16440         TP_printk("bdi %s: "
16441                   "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
16442                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16443 -                 "balanced_dirty_ratelimit=%lu cgroup=%s",
16444 +                 "balanced_dirty_ratelimit=%lu cgroup_ino=%u",
16445                   __entry->bdi,
16446                   __entry->write_bw,            /* write bandwidth */
16447                   __entry->avg_write_bw,        /* avg write bandwidth */
16448 @@ -510,7 +480,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
16449                   __entry->dirty_ratelimit,     /* base ratelimit */
16450                   __entry->task_ratelimit, /* ratelimit with position control */
16451                   __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
16452 -                 __get_str(cgroup)
16453 +                 __entry->cgroup_ino
16454         )
16455  );
16457 @@ -548,7 +518,7 @@ TRACE_EVENT(balance_dirty_pages,
16458                 __field(         long,  pause)
16459                 __field(unsigned long,  period)
16460                 __field(         long,  think)
16461 -               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
16462 +               __field(unsigned int,   cgroup_ino)
16463         ),
16465         TP_fast_assign(
16466 @@ -571,7 +541,7 @@ TRACE_EVENT(balance_dirty_pages,
16467                 __entry->period         = period * 1000 / HZ;
16468                 __entry->pause          = pause * 1000 / HZ;
16469                 __entry->paused         = (jiffies - start_time) * 1000 / HZ;
16470 -               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
16471 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(wb);
16472         ),
16475 @@ -580,7 +550,7 @@ TRACE_EVENT(balance_dirty_pages,
16476                   "bdi_setpoint=%lu bdi_dirty=%lu "
16477                   "dirty_ratelimit=%lu task_ratelimit=%lu "
16478                   "dirtied=%u dirtied_pause=%u "
16479 -                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
16480 +                 "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u",
16481                   __entry->bdi,
16482                   __entry->limit,
16483                   __entry->setpoint,
16484 @@ -595,7 +565,7 @@ TRACE_EVENT(balance_dirty_pages,
16485                   __entry->pause,       /* ms */
16486                   __entry->period,      /* ms */
16487                   __entry->think,       /* ms */
16488 -                 __get_str(cgroup)
16489 +                 __entry->cgroup_ino
16490           )
16491  );
16493 @@ -609,8 +579,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16494                 __field(unsigned long, ino)
16495                 __field(unsigned long, state)
16496                 __field(unsigned long, dirtied_when)
16497 -               __dynamic_array(char, cgroup,
16498 -                               __trace_wb_cgroup_size(inode_to_wb(inode)))
16499 +               __field(unsigned int, cgroup_ino)
16500         ),
16502         TP_fast_assign(
16503 @@ -619,16 +588,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
16504                 __entry->ino            = inode->i_ino;
16505                 __entry->state          = inode->i_state;
16506                 __entry->dirtied_when   = inode->dirtied_when;
16507 -               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
16508 +               __entry->cgroup_ino     = __trace_wb_assign_cgroup(inode_to_wb(inode));
16509         ),
16511 -       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
16512 +       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u",
16513                   __entry->name,
16514                   __entry->ino,
16515                   show_inode_state(__entry->state),
16516                   __entry->dirtied_when,
16517                   (jiffies - __entry->dirtied_when) / HZ,
16518 -                 __get_str(cgroup)
16519 +                 __entry->cgroup_ino
16520         )
16521  );
16523 @@ -684,7 +653,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16524                 __field(unsigned long, writeback_index)
16525                 __field(long, nr_to_write)
16526                 __field(unsigned long, wrote)
16527 -               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
16528 +               __field(unsigned int, cgroup_ino)
16529         ),
16531         TP_fast_assign(
16532 @@ -696,11 +665,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16533                 __entry->writeback_index = inode->i_mapping->writeback_index;
16534                 __entry->nr_to_write    = nr_to_write;
16535                 __entry->wrote          = nr_to_write - wbc->nr_to_write;
16536 -               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
16537 +               __entry->cgroup_ino     = __trace_wbc_assign_cgroup(wbc);
16538         ),
16540         TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
16541 -                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
16542 +                 "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u",
16543                   __entry->name,
16544                   __entry->ino,
16545                   show_inode_state(__entry->state),
16546 @@ -709,7 +678,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
16547                   __entry->writeback_index,
16548                   __entry->nr_to_write,
16549                   __entry->wrote,
16550 -                 __get_str(cgroup)
16551 +                 __entry->cgroup_ino
16552         )
16553  );
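Because the writeback trace events now carry a kernfs inode number instead of a cgroup path, a trace consumer has to resolve cgroup_ino itself, e.g. by comparing it with st_ino of candidate cgroup directories. A hypothetical userspace sketch (the mount point and group name are examples only, not part of the patch):

    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct stat st;

            if (stat("/sys/fs/cgroup/memory/mygroup", &st) == 0)
                    printf("cgroup_ino=%u\n", (unsigned int)st.st_ino);
            return 0;
    }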
16555 diff --git a/init/Kconfig b/init/Kconfig
16556 index 235c7a2c0d20..a7c81c0911da 100644
16557 --- a/init/Kconfig
16558 +++ b/init/Kconfig
16559 @@ -498,7 +498,7 @@ config TINY_RCU
16561  config RCU_EXPERT
16562         bool "Make expert-level adjustments to RCU configuration"
16563 -       default n
16564 +       default y if PREEMPT_RT_FULL
16565         help
16566           This option needs to be enabled if you wish to make
16567           expert-level adjustments to RCU configuration.  By default,
16568 @@ -614,7 +614,7 @@ config RCU_FANOUT_LEAF
16570  config RCU_FAST_NO_HZ
16571         bool "Accelerate last non-dyntick-idle CPU's grace periods"
16572 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
16573 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
16574         default n
16575         help
16576           This option permits CPUs to enter dynticks-idle state even if
16577 @@ -641,7 +641,7 @@ config TREE_RCU_TRACE
16578  config RCU_BOOST
16579         bool "Enable RCU priority boosting"
16580         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
16581 -       default n
16582 +       default y if PREEMPT_RT_FULL
16583         help
16584           This option boosts the priority of preempted RCU readers that
16585           block the current preemptible RCU grace period for too long.
16586 @@ -1106,6 +1106,7 @@ config CFS_BANDWIDTH
16587  config RT_GROUP_SCHED
16588         bool "Group scheduling for SCHED_RR/FIFO"
16589         depends on CGROUP_SCHED
16590 +       depends on !PREEMPT_RT_FULL
16591         default n
16592         help
16593           This feature lets you explicitly allocate real CPU bandwidth
16594 @@ -1719,6 +1720,7 @@ choice
16596  config SLAB
16597         bool "SLAB"
16598 +       depends on !PREEMPT_RT_FULL
16599         help
16600           The regular slab allocator that is established and known to work
16601           well in all environments. It organizes cache hot objects in
16602 @@ -1737,6 +1739,7 @@ config SLUB
16603  config SLOB
16604         depends on EXPERT
16605         bool "SLOB (Simple Allocator)"
16606 +       depends on !PREEMPT_RT_FULL
16607         help
16608            SLOB replaces the stock allocator with a drastically simpler
16609            allocator. SLOB is generally more space efficient but
16610 @@ -1746,7 +1749,7 @@ endchoice
16612  config SLUB_CPU_PARTIAL
16613         default y
16614 -       depends on SLUB && SMP
16615 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
16616         bool "SLUB per cpu partial cache"
16617         help
16618           Per cpu partial caches accelerate object allocation and freeing
16619 diff --git a/init/Makefile b/init/Makefile
16620 index 7bc47ee31c36..88cf473554e0 100644
16621 --- a/init/Makefile
16622 +++ b/init/Makefile
16623 @@ -33,4 +33,4 @@ silent_chk_compile.h = :
16624  include/generated/compile.h: FORCE
16625         @$($(quiet)chk_compile.h)
16626         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
16627 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
16628 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
16629 diff --git a/init/main.c b/init/main.c
16630 index 9e64d7097f1a..4a76e629c137 100644
16631 --- a/init/main.c
16632 +++ b/init/main.c
16633 @@ -530,6 +530,7 @@ asmlinkage __visible void __init start_kernel(void)
16634         setup_command_line(command_line);
16635         setup_nr_cpu_ids();
16636         setup_per_cpu_areas();
16637 +       softirq_early_init();
16638         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
16640         build_all_zonelists(NULL, NULL);
16641 diff --git a/ipc/msg.c b/ipc/msg.c
16642 index c6521c205cb4..996d89023552 100644
16643 --- a/ipc/msg.c
16644 +++ b/ipc/msg.c
16645 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
16646         }
16649 -static void expunge_all(struct msg_queue *msq, int res)
16650 +static void expunge_all(struct msg_queue *msq, int res,
16651 +                       struct wake_q_head *wake_q)
16653         struct msg_receiver *msr, *t;
16655         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
16656 -               msr->r_msg = NULL; /* initialize expunge ordering */
16657 -               wake_up_process(msr->r_tsk);
16658 -               /*
16659 -                * Ensure that the wakeup is visible before setting r_msg as
16660 -                * the receiving end depends on it: either spinning on a nil,
16661 -                * or dealing with -EAGAIN cases. See lockless receive part 1
16662 -                * and 2 in do_msgrcv().
16663 -                */
16664 -               smp_wmb(); /* barrier (B) */
16666 +               wake_q_add(wake_q, msr->r_tsk);
16667                 msr->r_msg = ERR_PTR(res);
16668         }
16670 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
16672         struct msg_msg *msg, *t;
16673         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
16674 +       WAKE_Q(wake_q);
16676 -       expunge_all(msq, -EIDRM);
16677 +       expunge_all(msq, -EIDRM, &wake_q);
16678         ss_wakeup(&msq->q_senders, 1);
16679         msg_rmid(ns, msq);
16680         ipc_unlock_object(&msq->q_perm);
16681 +       wake_up_q(&wake_q);
16682         rcu_read_unlock();
16684         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
16685 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16686         struct kern_ipc_perm *ipcp;
16687         struct msqid64_ds uninitialized_var(msqid64);
16688         struct msg_queue *msq;
16689 +       WAKE_Q(wake_q);
16690         int err;
16692         if (cmd == IPC_SET) {
16693 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16694                 /* sleeping receivers might be excluded by
16695                  * stricter permissions.
16696                  */
16697 -               expunge_all(msq, -EAGAIN);
16698 +               expunge_all(msq, -EAGAIN, &wake_q);
16699                 /* sleeping senders might be able to send
16700                  * due to a larger queue size.
16701                  */
16702 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
16704  out_unlock0:
16705         ipc_unlock_object(&msq->q_perm);
16706 +       wake_up_q(&wake_q);
16707  out_unlock1:
16708         rcu_read_unlock();
16709  out_up:
16710 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
16711         return 0;
16714 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16715 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
16716 +                                struct wake_q_head *wake_q)
16718         struct msg_receiver *msr, *t;
16720 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
16722                         list_del(&msr->r_list);
16723                         if (msr->r_maxsize < msg->m_ts) {
16724 -                               /* initialize pipelined send ordering */
16725 -                               msr->r_msg = NULL;
16726 -                               wake_up_process(msr->r_tsk);
16727 -                               /* barrier (B) see barrier comment below */
16728 -                               smp_wmb();
16729 +                               wake_q_add(wake_q, msr->r_tsk);
16730                                 msr->r_msg = ERR_PTR(-E2BIG);
16731                         } else {
16732 -                               msr->r_msg = NULL;
16733                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
16734                                 msq->q_rtime = get_seconds();
16735 -                               wake_up_process(msr->r_tsk);
16736 -                               /*
16737 -                                * Ensure that the wakeup is visible before
16738 -                                * setting r_msg, as the receiving can otherwise
16739 -                                * exit - once r_msg is set, the receiver can
16740 -                                * continue. See lockless receive part 1 and 2
16741 -                                * in do_msgrcv(). Barrier (B).
16742 -                                */
16743 -                               smp_wmb();
16744 +                               wake_q_add(wake_q, msr->r_tsk);
16745                                 msr->r_msg = msg;
16747                                 return 1;
16748                         }
16749                 }
16750 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16751         struct msg_msg *msg;
16752         int err;
16753         struct ipc_namespace *ns;
16754 +       WAKE_Q(wake_q);
16756         ns = current->nsproxy->ipc_ns;
16758 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16759         msq->q_lspid = task_tgid_vnr(current);
16760         msq->q_stime = get_seconds();
16762 -       if (!pipelined_send(msq, msg)) {
16763 +       if (!pipelined_send(msq, msg, &wake_q)) {
16764                 /* no one is waiting for this message, enqueue it */
16765                 list_add_tail(&msg->m_list, &msq->q_messages);
16766                 msq->q_cbytes += msgsz;
16767 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
16769  out_unlock0:
16770         ipc_unlock_object(&msq->q_perm);
16771 +       wake_up_q(&wake_q);
16772  out_unlock1:
16773         rcu_read_unlock();
16774         if (msg != NULL)
16775 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
16776                 rcu_read_lock();
16778                 /* Lockless receive, part 2:
16779 -                * Wait until pipelined_send or expunge_all are outside of
16780 -                * wake_up_process(). There is a race with exit(), see
16781 -                * ipc/mqueue.c for the details. The correct serialization
16782 -                * ensures that a receiver cannot continue without the wakeup
16783 -                * being visibible _before_ setting r_msg:
16784 +                * The work in pipelined_send() and expunge_all():
16785 +                * - Set pointer to message
16786 +                * - Queue the receiver task for later wakeup
16787 +                * - Wake up the process after the lock is dropped.
16788                  *
16789 -                * CPU 0                             CPU 1
16790 -                * <loop receiver>
16791 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
16792 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
16793 -                *                            |        wake_up_process();
16794 -                * <continue>                 `------> smp_wmb(); (B)
16795 -                *                                     msr->r_msg = msg;
16796 -                *
16797 -                * Where (A) orders the message value read and where (B) orders
16798 -                * the write to the r_msg -- done in both pipelined_send and
16799 -                * expunge_all.
16800 +                * Should the process wake up before this wakeup (due to a
16801 +                * signal) it will either see the message and continue ...
16802                  */
16803 -               for (;;) {
16804 -                       /*
16805 -                        * Pairs with writer barrier in pipelined_send
16806 -                        * or expunge_all.
16807 -                        */
16808 -                       smp_rmb(); /* barrier (A) */
16809 -                       msg = (struct msg_msg *)msr_d.r_msg;
16810 -                       if (msg)
16811 -                               break;
16813 -                       /*
16814 -                        * The cpu_relax() call is a compiler barrier
16815 -                        * which forces everything in this loop to be
16816 -                        * re-loaded.
16817 -                        */
16818 -                       cpu_relax();
16819 -               }
16821 -               /* Lockless receive, part 3:
16822 -                * If there is a message or an error then accept it without
16823 -                * locking.
16824 -                */
16825 +               msg = (struct msg_msg *)msr_d.r_msg;
16826                 if (msg != ERR_PTR(-EAGAIN))
16827                         goto out_unlock1;
16829 -               /* Lockless receive, part 3:
16830 -                * Acquire the queue spinlock.
16831 -                */
16832 +                /*
16833 +                 * ... or see -EAGAIN, acquire the lock to check the message
16834 +                 * again.
16835 +                 */
16836                 ipc_lock_object(&msq->q_perm);
16838 -               /* Lockless receive, part 4:
16839 -                * Repeat test after acquiring the spinlock.
16840 -                */
16841                 msg = (struct msg_msg *)msr_d.r_msg;
16842                 if (msg != ERR_PTR(-EAGAIN))
16843                         goto out_unlock0;
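The msg.c conversion above follows the generic wake_q idiom: wakeups are collected while the IPC lock is held and issued only after it has been dropped, so that (in particular on RT, where the lock is a sleeping lock) a woken receiver can never immediately block on the lock its waker still holds. A standalone sketch of the pattern, with struct my_waiter and my_release_waiters() as made-up illustrations:

    #include <linux/list.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>

    struct my_waiter {
            struct list_head        list;
            struct task_struct      *task;
    };

    static void my_release_waiters(spinlock_t *lock, struct list_head *waiters)
    {
            struct my_waiter *w, *t;
            WAKE_Q(wake_q);

            spin_lock(lock);
            list_for_each_entry_safe(w, t, waiters, list) {
                    list_del(&w->list);
                    wake_q_add(&wake_q, w->task);   /* queue, don't wake yet */
            }
            spin_unlock(lock);

            wake_up_q(&wake_q);     /* wakeups happen without the lock held */
    }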
16844 diff --git a/ipc/sem.c b/ipc/sem.c
16845 index 9862c3d1c26d..ef34d7376697 100644
16846 --- a/ipc/sem.c
16847 +++ b/ipc/sem.c
16848 @@ -708,6 +708,13 @@ undo:
16849  static void wake_up_sem_queue_prepare(struct list_head *pt,
16850                                 struct sem_queue *q, int error)
16852 +#ifdef CONFIG_PREEMPT_RT_BASE
16853 +       struct task_struct *p = q->sleeper;
16854 +       get_task_struct(p);
16855 +       q->status = error;
16856 +       wake_up_process(p);
16857 +       put_task_struct(p);
16858 +#else
16859         if (list_empty(pt)) {
16860                 /*
16861                  * Hold preempt off so that we don't get preempted and have the
16862 @@ -719,6 +726,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16863         q->pid = error;
16865         list_add_tail(&q->list, pt);
16866 +#endif
16869  /**
16870 @@ -732,6 +740,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
16871   */
16872  static void wake_up_sem_queue_do(struct list_head *pt)
16874 +#ifndef CONFIG_PREEMPT_RT_BASE
16875         struct sem_queue *q, *t;
16876         int did_something;
16878 @@ -744,6 +753,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
16879         }
16880         if (did_something)
16881                 preempt_enable();
16882 +#endif
16885  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
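The RT branch in wake_up_sem_queue_prepare() wakes the sleeper directly instead of batching the wakeup under preempt_disable(). The essence of that idiom, as an illustrative sketch only (my_wake_one() is a made-up name): the task reference keeps the task_struct valid across the wakeup even if the woken task runs and exits before the waker is done with it.

    #include <linux/sched.h>

    static void my_wake_one(struct task_struct *sleeper, int *status, int error)
    {
            get_task_struct(sleeper);       /* pin the task across the wakeup */
            *status = error;                /* publish the result first */
            wake_up_process(sleeper);
            put_task_struct(sleeper);
    }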
16886 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
16887 index ebdb0043203a..b9e6aa7e5aa6 100644
16888 --- a/kernel/Kconfig.locks
16889 +++ b/kernel/Kconfig.locks
16890 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
16892  config MUTEX_SPIN_ON_OWNER
16893         def_bool y
16894 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
16895 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16897  config RWSEM_SPIN_ON_OWNER
16898         def_bool y
16899 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
16900 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
16902  config LOCK_SPIN_ON_OWNER
16903         def_bool y
16904 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
16905 index 3f9c97419f02..11dbe26a8279 100644
16906 --- a/kernel/Kconfig.preempt
16907 +++ b/kernel/Kconfig.preempt
16908 @@ -1,3 +1,16 @@
16909 +config PREEMPT
16910 +       bool
16911 +       select PREEMPT_COUNT
16913 +config PREEMPT_RT_BASE
16914 +       bool
16915 +       select PREEMPT
16917 +config HAVE_PREEMPT_LAZY
16918 +       bool
16920 +config PREEMPT_LAZY
16921 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
16923  choice
16924         prompt "Preemption Model"
16925 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
16927           Select this if you are building a kernel for a desktop system.
16929 -config PREEMPT
16930 +config PREEMPT__LL
16931         bool "Preemptible Kernel (Low-Latency Desktop)"
16932 -       select PREEMPT_COUNT
16933 +       select PREEMPT
16934         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
16935         help
16936           This option reduces the latency of the kernel by making
16937 @@ -52,6 +65,22 @@ config PREEMPT
16938           embedded system with latency requirements in the milliseconds
16939           range.
16941 +config PREEMPT_RTB
16942 +       bool "Preemptible Kernel (Basic RT)"
16943 +       select PREEMPT_RT_BASE
16944 +       help
16945 +         This option is basically the same as (Low-Latency Desktop) but
16946 +         enables changes which are preliminary for the full preemptible
16947 +         RT kernel.
16949 +config PREEMPT_RT_FULL
16950 +       bool "Fully Preemptible Kernel (RT)"
16951 +       depends on IRQ_FORCED_THREADING
16952 +       select PREEMPT_RT_BASE
16953 +       select PREEMPT_RCU
16954 +       help
16955 +         All and everything
16957  endchoice
16959  config PREEMPT_COUNT
16960 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
16961 index 4cb94b678e9f..8c41ee8a6fee 100644
16962 --- a/kernel/cgroup.c
16963 +++ b/kernel/cgroup.c
16964 @@ -4741,10 +4741,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
16965         queue_work(cgroup_destroy_wq, &css->destroy_work);
16968 -static void css_release_work_fn(struct work_struct *work)
16969 +static void css_release_work_fn(struct swork_event *sev)
16971         struct cgroup_subsys_state *css =
16972 -               container_of(work, struct cgroup_subsys_state, destroy_work);
16973 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
16974         struct cgroup_subsys *ss = css->ss;
16975         struct cgroup *cgrp = css->cgroup;
16977 @@ -4783,8 +4783,8 @@ static void css_release(struct percpu_ref *ref)
16978         struct cgroup_subsys_state *css =
16979                 container_of(ref, struct cgroup_subsys_state, refcnt);
16981 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
16982 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
16983 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
16984 +       swork_queue(&css->destroy_swork);
16987  static void init_and_link_css(struct cgroup_subsys_state *css,
16988 @@ -5401,6 +5401,7 @@ static int __init cgroup_wq_init(void)
16989          */
16990         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
16991         BUG_ON(!cgroup_destroy_wq);
16992 +       BUG_ON(swork_get());
16994         /*
16995          * Used to destroy pidlists and separate to serve as flush domain.
16996 diff --git a/kernel/cpu.c b/kernel/cpu.c
16997 index 40d20bf5de28..0be18c1684d8 100644
16998 --- a/kernel/cpu.c
16999 +++ b/kernel/cpu.c
17000 @@ -75,8 +75,8 @@ static struct {
17001  #endif
17002  } cpu_hotplug = {
17003         .active_writer = NULL,
17004 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
17005         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
17006 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
17007  #ifdef CONFIG_DEBUG_LOCK_ALLOC
17008         .dep_map = {.name = "cpu_hotplug.lock" },
17009  #endif
17010 @@ -89,6 +89,289 @@ static struct {
17011  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
17012  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
17014 +/**
17015 + * hotplug_pcp - per cpu hotplug descriptor
17016 + * @unplug:    set when pin_current_cpu() needs to sync tasks
17017 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
17018 + * @refcount:  counter of tasks in pinned sections
17019 + * @grab_lock: set when the tasks entering pinned sections should wait
17020 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
17021 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
17022 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
17023 + *
17024 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
17025 + * is used as a flag and still exists after @sync_tsk has exited and
17026 + * @sync_tsk set to NULL.
17027 + */
17028 +struct hotplug_pcp {
17029 +       struct task_struct *unplug;
17030 +       struct task_struct *sync_tsk;
17031 +       int refcount;
17032 +       int grab_lock;
17033 +       struct completion synced;
17034 +       struct completion unplug_wait;
17035 +#ifdef CONFIG_PREEMPT_RT_FULL
17036 +       /*
17037 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
17038 +        * the task, otherwise the mutex will cause the task to fail
17039 +        * to sleep when required. (Because it's called from migrate_disable())
17040 +        *
17041 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
17042 +        * state.
17043 +        */
17044 +       spinlock_t lock;
17045 +#else
17046 +       struct mutex mutex;
17047 +#endif
17048 +       int mutex_init;
17051 +#ifdef CONFIG_PREEMPT_RT_FULL
17052 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
17053 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
17054 +#else
17055 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
17056 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
17057 +#endif
17059 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
17061 +/**
17062 + * pin_current_cpu - Prevent the current cpu from being unplugged
17063 + *
17064 + * Lightweight version of get_online_cpus() to prevent cpu from being
17065 + * unplugged when code runs in a migration disabled region.
17066 + *
17067 + * Must be called with preemption disabled (preempt_count = 1)!
17068 + */
17069 +void pin_current_cpu(void)
17071 +       struct hotplug_pcp *hp;
17072 +       int force = 0;
17074 +retry:
17075 +       hp = this_cpu_ptr(&hotplug_pcp);
17077 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
17078 +           hp->unplug == current) {
17079 +               hp->refcount++;
17080 +               return;
17081 +       }
17082 +       if (hp->grab_lock) {
17083 +               preempt_enable();
17084 +               hotplug_lock(hp);
17085 +               hotplug_unlock(hp);
17086 +       } else {
17087 +               preempt_enable();
17088 +               /*
17089 +                * Try to push this task off of this CPU.
17090 +                */
17091 +               if (!migrate_me()) {
17092 +                       preempt_disable();
17093 +                       hp = this_cpu_ptr(&hotplug_pcp);
17094 +                       if (!hp->grab_lock) {
17095 +                               /*
17096 +                                * Just let it continue, it's already pinned
17097 +                                * or about to sleep.
17098 +                                */
17099 +                               force = 1;
17100 +                               goto retry;
17101 +                       }
17102 +                       preempt_enable();
17103 +               }
17104 +       }
17105 +       preempt_disable();
17106 +       goto retry;
17109 +/**
17110 + * unpin_current_cpu - Allow unplug of current cpu
17111 + *
17112 + * Must be called with preemption or interrupts disabled!
17113 + */
17114 +void unpin_current_cpu(void)
17116 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
17118 +       WARN_ON(hp->refcount <= 0);
17120 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
17121 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
17122 +               wake_up_process(hp->unplug);
17125 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
17127 +       set_current_state(TASK_UNINTERRUPTIBLE);
17128 +       while (hp->refcount) {
17129 +               schedule_preempt_disabled();
17130 +               set_current_state(TASK_UNINTERRUPTIBLE);
17131 +       }
17134 +static int sync_unplug_thread(void *data)
17136 +       struct hotplug_pcp *hp = data;
17138 +       wait_for_completion(&hp->unplug_wait);
17139 +       preempt_disable();
17140 +       hp->unplug = current;
17141 +       wait_for_pinned_cpus(hp);
17143 +       /*
17144 +        * This thread will synchronize the cpu_down() with threads
17145 +        * that have pinned the CPU. When the pinned CPU count reaches
17146 +        * zero, we inform the cpu_down code to continue to the next step.
17147 +        */
17148 +       set_current_state(TASK_UNINTERRUPTIBLE);
17149 +       preempt_enable();
17150 +       complete(&hp->synced);
17152 +       /*
17153 +        * If all succeeds, the next step will need tasks to wait till
17154 +        * the CPU is offline before continuing. To do this, the grab_lock
17155 +        * is set and tasks going into pin_current_cpu() will block on the
17156 +        * mutex. But we still need to wait for those that are already in
17157 +        * pinned CPU sections. If cpu_down() fails, kthread_should_stop()
17158 +        * will kick this thread out.
17159 +        */
17160 +       while (!hp->grab_lock && !kthread_should_stop()) {
17161 +               schedule();
17162 +               set_current_state(TASK_UNINTERRUPTIBLE);
17163 +       }
17165 +       /* Make sure grab_lock is seen before we see a stale completion */
17166 +       smp_mb();
17168 +       /*
17169 +        * Now just before cpu_down() enters stop machine, we need to make
17170 +        * sure all tasks that are in pinned CPU sections are out, and new
17171 +        * tasks will now grab the lock, keeping them from entering pinned
17172 +        * CPU sections.
17173 +        */
17174 +       if (!kthread_should_stop()) {
17175 +               preempt_disable();
17176 +               wait_for_pinned_cpus(hp);
17177 +               preempt_enable();
17178 +               complete(&hp->synced);
17179 +       }
17181 +       set_current_state(TASK_UNINTERRUPTIBLE);
17182 +       while (!kthread_should_stop()) {
17183 +               schedule();
17184 +               set_current_state(TASK_UNINTERRUPTIBLE);
17185 +       }
17186 +       set_current_state(TASK_RUNNING);
17188 +       /*
17189 +        * Force this thread off this CPU as it's going down and
17190 +        * we don't want any more work on this CPU.
17191 +        */
17192 +       current->flags &= ~PF_NO_SETAFFINITY;
17193 +       set_cpus_allowed_ptr(current, cpu_present_mask);
17194 +       migrate_me();
17195 +       return 0;
17198 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
17200 +       wake_up_process(hp->sync_tsk);
17201 +       wait_for_completion(&hp->synced);
17204 +static void __cpu_unplug_wait(unsigned int cpu)
17206 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17208 +       complete(&hp->unplug_wait);
17209 +       wait_for_completion(&hp->synced);
17213 + * Start the sync_unplug_thread on the target cpu and wait for it to
17214 + * complete.
17215 + */
17216 +static int cpu_unplug_begin(unsigned int cpu)
17218 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17219 +       int err;
17221 +       /* Protected by cpu_hotplug.lock */
17222 +       if (!hp->mutex_init) {
17223 +#ifdef CONFIG_PREEMPT_RT_FULL
17224 +               spin_lock_init(&hp->lock);
17225 +#else
17226 +               mutex_init(&hp->mutex);
17227 +#endif
17228 +               hp->mutex_init = 1;
17229 +       }
17231 +       /* Inform the scheduler to migrate tasks off this CPU */
17232 +       tell_sched_cpu_down_begin(cpu);
17234 +       init_completion(&hp->synced);
17235 +       init_completion(&hp->unplug_wait);
17237 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
17238 +       if (IS_ERR(hp->sync_tsk)) {
17239 +               err = PTR_ERR(hp->sync_tsk);
17240 +               hp->sync_tsk = NULL;
17241 +               return err;
17242 +       }
17243 +       kthread_bind(hp->sync_tsk, cpu);
17245 +       /*
17246 +        * Wait for tasks to get out of the pinned sections,
17247 +        * it's still OK if new tasks enter. Some CPU notifiers will
17248 +        * wait for tasks that are going to enter these sections and
17249 +        * we must not have them block.
17250 +        */
17251 +       wake_up_process(hp->sync_tsk);
17252 +       return 0;
17255 +static void cpu_unplug_sync(unsigned int cpu)
17257 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17259 +       init_completion(&hp->synced);
17260 +       /* The completion needs to be initialized before setting grab_lock */
17261 +       smp_wmb();
17263 +       /* Grab the mutex before setting grab_lock */
17264 +       hotplug_lock(hp);
17265 +       hp->grab_lock = 1;
17267 +       /*
17268 +        * The CPU notifiers have been completed.
17269 +        * Wait for tasks to get out of pinned CPU sections and have new
17270 +        * tasks block until the CPU is completely down.
17271 +        */
17272 +       __cpu_unplug_sync(hp);
17274 +       /* All done with the sync thread */
17275 +       kthread_stop(hp->sync_tsk);
17276 +       hp->sync_tsk = NULL;
17279 +static void cpu_unplug_done(unsigned int cpu)
17281 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
17283 +       hp->unplug = NULL;
17284 +       /* Let all tasks know cpu unplug is finished before cleaning up */
17285 +       smp_wmb();
17287 +       if (hp->sync_tsk)
17288 +               kthread_stop(hp->sync_tsk);
17290 +       if (hp->grab_lock) {
17291 +               hotplug_unlock(hp);
17292 +               /* protected by cpu_hotplug.lock */
17293 +               hp->grab_lock = 0;
17294 +       }
17295 +       tell_sched_cpu_down_done(cpu);
17298  void get_online_cpus(void)
17300 @@ -338,13 +621,15 @@ static int take_cpu_down(void *_param)
17301  /* Requires cpu_add_remove_lock to be held */
17302  static int _cpu_down(unsigned int cpu, int tasks_frozen)
17304 -       int err, nr_calls = 0;
17305 +       int mycpu, err, nr_calls = 0;
17306         void *hcpu = (void *)(long)cpu;
17307         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
17308         struct take_cpu_down_param tcd_param = {
17309                 .mod = mod,
17310                 .hcpu = hcpu,
17311         };
17312 +       cpumask_var_t cpumask;
17313 +       cpumask_var_t cpumask_org;
17315         if (num_online_cpus() == 1)
17316                 return -EBUSY;
17317 @@ -352,7 +637,34 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17318         if (!cpu_online(cpu))
17319                 return -EINVAL;
17321 +       /* Move the downtaker off the unplug cpu */
17322 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
17323 +               return -ENOMEM;
17324 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
17325 +               free_cpumask_var(cpumask);
17326 +               return -ENOMEM;
17327 +       }
17329 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
17330 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
17331 +       set_cpus_allowed_ptr(current, cpumask);
17332 +       free_cpumask_var(cpumask);
17333 +       migrate_disable();
17334 +       mycpu = smp_processor_id();
17335 +       if (mycpu == cpu) {
17336 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
17337 +               migrate_enable();
17338 +               err = -EBUSY;
17339 +               goto restore_cpus;
17340 +       }
17341 +       migrate_enable();
17343         cpu_hotplug_begin();
17344 +       err = cpu_unplug_begin(cpu);
17345 +       if (err) {
17346 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
17347 +               goto out_cancel;
17348 +       }
17350         err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
17351         if (err) {
17352 @@ -378,8 +690,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17353         else
17354                 synchronize_rcu();
17356 +       __cpu_unplug_wait(cpu);
17357         smpboot_park_threads(cpu);
17359 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
17360 +       cpu_unplug_sync(cpu);
17362         /*
17363          * Prevent irq alloc/free while the dying cpu reorganizes the
17364          * interrupt affinities.
17365 @@ -424,9 +740,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
17366         check_for_tasks(cpu);
17368  out_release:
17369 +       cpu_unplug_done(cpu);
17370 +out_cancel:
17371         cpu_hotplug_done();
17372         if (!err)
17373                 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
17374 +restore_cpus:
17375 +       set_cpus_allowed_ptr(current, cpumask_org);
17376 +       free_cpumask_var(cpumask_org);
17377         return err;
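
The kernel/cpu.c hunks above add a per-CPU hotplug_pcp refcount: tasks pin the CPU they run on, and sync_unplug_thread()/wait_for_pinned_cpus() keep cpu_down() from proceeding until that count drains to zero. Below is a minimal userspace sketch of that drain handshake, assuming POSIX threads; the mutex/condvar pair stands in for the scheduler-based sleep/wakeup and all names are illustrative, not kernel APIs.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative stand-in for struct hotplug_pcp: a refcount plus a waiter. */
struct hp_model {
	pthread_mutex_t lock;
	pthread_cond_t  drained;
	int refcount;		/* tasks currently "pinned" to the CPU */
	int unplug_pending;	/* models hp->unplug being set by the sync thread */
};

static struct hp_model hp = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0
};

static void pin_cpu(void)		/* analogous to pin_current_cpu() */
{
	pthread_mutex_lock(&hp.lock);
	hp.refcount++;
	pthread_mutex_unlock(&hp.lock);
}

static void unpin_cpu(void)		/* analogous to unpin_current_cpu() */
{
	pthread_mutex_lock(&hp.lock);
	if (--hp.refcount == 0 && hp.unplug_pending)
		pthread_cond_signal(&hp.drained);	/* wake_up_process(hp->unplug) */
	pthread_mutex_unlock(&hp.lock);
}

static void wait_for_pinned(void)	/* analogous to wait_for_pinned_cpus() */
{
	pthread_mutex_lock(&hp.lock);
	hp.unplug_pending = 1;
	while (hp.refcount)
		pthread_cond_wait(&hp.drained, &hp.lock);
	pthread_mutex_unlock(&hp.lock);
}

static void *pinned_worker(void *arg)
{
	(void)arg;
	pin_cpu();
	usleep(100 * 1000);	/* time spent inside a "pinned CPU section" */
	unpin_cpu();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, pinned_worker, NULL);
	usleep(10 * 1000);	/* demo only: give the worker a chance to pin first */
	wait_for_pinned();	/* the cpu_down() side blocks here until drained */
	puts("all pinned sections drained, unplug may continue");
	pthread_join(t, NULL);
	return 0;
}
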
17380 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
17381 index b271353d5202..dd7b87b7f618 100644
17382 --- a/kernel/cpuset.c
17383 +++ b/kernel/cpuset.c
17384 @@ -283,7 +283,7 @@ static struct cpuset top_cpuset = {
17385   */
17387  static DEFINE_MUTEX(cpuset_mutex);
17388 -static DEFINE_SPINLOCK(callback_lock);
17389 +static DEFINE_RAW_SPINLOCK(callback_lock);
17391  static struct workqueue_struct *cpuset_migrate_mm_wq;
17393 @@ -906,9 +906,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
17394                         continue;
17395                 rcu_read_unlock();
17397 -               spin_lock_irq(&callback_lock);
17398 +               raw_spin_lock_irq(&callback_lock);
17399                 cpumask_copy(cp->effective_cpus, new_cpus);
17400 -               spin_unlock_irq(&callback_lock);
17401 +               raw_spin_unlock_irq(&callback_lock);
17403                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
17404                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
17405 @@ -973,9 +973,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
17406         if (retval < 0)
17407                 return retval;
17409 -       spin_lock_irq(&callback_lock);
17410 +       raw_spin_lock_irq(&callback_lock);
17411         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
17412 -       spin_unlock_irq(&callback_lock);
17413 +       raw_spin_unlock_irq(&callback_lock);
17415         /* use trialcs->cpus_allowed as a temp variable */
17416         update_cpumasks_hier(cs, trialcs->cpus_allowed);
17417 @@ -1184,9 +1184,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
17418                         continue;
17419                 rcu_read_unlock();
17421 -               spin_lock_irq(&callback_lock);
17422 +               raw_spin_lock_irq(&callback_lock);
17423                 cp->effective_mems = *new_mems;
17424 -               spin_unlock_irq(&callback_lock);
17425 +               raw_spin_unlock_irq(&callback_lock);
17427                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
17428                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
17429 @@ -1254,9 +1254,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
17430         if (retval < 0)
17431                 goto done;
17433 -       spin_lock_irq(&callback_lock);
17434 +       raw_spin_lock_irq(&callback_lock);
17435         cs->mems_allowed = trialcs->mems_allowed;
17436 -       spin_unlock_irq(&callback_lock);
17437 +       raw_spin_unlock_irq(&callback_lock);
17439         /* use trialcs->mems_allowed as a temp variable */
17440         update_nodemasks_hier(cs, &trialcs->mems_allowed);
17441 @@ -1347,9 +1347,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
17442         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
17443                         || (is_spread_page(cs) != is_spread_page(trialcs)));
17445 -       spin_lock_irq(&callback_lock);
17446 +       raw_spin_lock_irq(&callback_lock);
17447         cs->flags = trialcs->flags;
17448 -       spin_unlock_irq(&callback_lock);
17449 +       raw_spin_unlock_irq(&callback_lock);
17451         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
17452                 rebuild_sched_domains_locked();
17453 @@ -1761,7 +1761,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
17454         cpuset_filetype_t type = seq_cft(sf)->private;
17455         int ret = 0;
17457 -       spin_lock_irq(&callback_lock);
17458 +       raw_spin_lock_irq(&callback_lock);
17460         switch (type) {
17461         case FILE_CPULIST:
17462 @@ -1780,7 +1780,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
17463                 ret = -EINVAL;
17464         }
17466 -       spin_unlock_irq(&callback_lock);
17467 +       raw_spin_unlock_irq(&callback_lock);
17468         return ret;
17471 @@ -1994,12 +1994,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
17473         cpuset_inc();
17475 -       spin_lock_irq(&callback_lock);
17476 +       raw_spin_lock_irq(&callback_lock);
17477         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
17478                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
17479                 cs->effective_mems = parent->effective_mems;
17480         }
17481 -       spin_unlock_irq(&callback_lock);
17482 +       raw_spin_unlock_irq(&callback_lock);
17484         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
17485                 goto out_unlock;
17486 @@ -2026,12 +2026,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
17487         }
17488         rcu_read_unlock();
17490 -       spin_lock_irq(&callback_lock);
17491 +       raw_spin_lock_irq(&callback_lock);
17492         cs->mems_allowed = parent->mems_allowed;
17493         cs->effective_mems = parent->mems_allowed;
17494         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
17495         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
17496 -       spin_unlock_irq(&callback_lock);
17497 +       raw_spin_unlock_irq(&callback_lock);
17498  out_unlock:
17499         mutex_unlock(&cpuset_mutex);
17500         return 0;
17501 @@ -2070,7 +2070,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
17502  static void cpuset_bind(struct cgroup_subsys_state *root_css)
17504         mutex_lock(&cpuset_mutex);
17505 -       spin_lock_irq(&callback_lock);
17506 +       raw_spin_lock_irq(&callback_lock);
17508         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
17509                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
17510 @@ -2081,7 +2081,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
17511                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
17512         }
17514 -       spin_unlock_irq(&callback_lock);
17515 +       raw_spin_unlock_irq(&callback_lock);
17516         mutex_unlock(&cpuset_mutex);
17519 @@ -2182,12 +2182,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
17521         bool is_empty;
17523 -       spin_lock_irq(&callback_lock);
17524 +       raw_spin_lock_irq(&callback_lock);
17525         cpumask_copy(cs->cpus_allowed, new_cpus);
17526         cpumask_copy(cs->effective_cpus, new_cpus);
17527         cs->mems_allowed = *new_mems;
17528         cs->effective_mems = *new_mems;
17529 -       spin_unlock_irq(&callback_lock);
17530 +       raw_spin_unlock_irq(&callback_lock);
17532         /*
17533          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
17534 @@ -2224,10 +2224,10 @@ hotplug_update_tasks(struct cpuset *cs,
17535         if (nodes_empty(*new_mems))
17536                 *new_mems = parent_cs(cs)->effective_mems;
17538 -       spin_lock_irq(&callback_lock);
17539 +       raw_spin_lock_irq(&callback_lock);
17540         cpumask_copy(cs->effective_cpus, new_cpus);
17541         cs->effective_mems = *new_mems;
17542 -       spin_unlock_irq(&callback_lock);
17543 +       raw_spin_unlock_irq(&callback_lock);
17545         if (cpus_updated)
17546                 update_tasks_cpumask(cs);
17547 @@ -2313,21 +2313,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
17549         /* synchronize cpus_allowed to cpu_active_mask */
17550         if (cpus_updated) {
17551 -               spin_lock_irq(&callback_lock);
17552 +               raw_spin_lock_irq(&callback_lock);
17553                 if (!on_dfl)
17554                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
17555                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
17556 -               spin_unlock_irq(&callback_lock);
17557 +               raw_spin_unlock_irq(&callback_lock);
17558                 /* we don't mess with cpumasks of tasks in top_cpuset */
17559         }
17561         /* synchronize mems_allowed to N_MEMORY */
17562         if (mems_updated) {
17563 -               spin_lock_irq(&callback_lock);
17564 +               raw_spin_lock_irq(&callback_lock);
17565                 if (!on_dfl)
17566                         top_cpuset.mems_allowed = new_mems;
17567                 top_cpuset.effective_mems = new_mems;
17568 -               spin_unlock_irq(&callback_lock);
17569 +               raw_spin_unlock_irq(&callback_lock);
17570                 update_tasks_nodemask(&top_cpuset);
17571         }
17573 @@ -2425,11 +2425,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
17575         unsigned long flags;
17577 -       spin_lock_irqsave(&callback_lock, flags);
17578 +       raw_spin_lock_irqsave(&callback_lock, flags);
17579         rcu_read_lock();
17580         guarantee_online_cpus(task_cs(tsk), pmask);
17581         rcu_read_unlock();
17582 -       spin_unlock_irqrestore(&callback_lock, flags);
17583 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17586  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
17587 @@ -2477,11 +2477,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
17588         nodemask_t mask;
17589         unsigned long flags;
17591 -       spin_lock_irqsave(&callback_lock, flags);
17592 +       raw_spin_lock_irqsave(&callback_lock, flags);
17593         rcu_read_lock();
17594         guarantee_online_mems(task_cs(tsk), &mask);
17595         rcu_read_unlock();
17596 -       spin_unlock_irqrestore(&callback_lock, flags);
17597 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17599         return mask;
17601 @@ -2573,14 +2573,14 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
17602                 return 1;
17604         /* Not hardwall and node outside mems_allowed: scan up cpusets */
17605 -       spin_lock_irqsave(&callback_lock, flags);
17606 +       raw_spin_lock_irqsave(&callback_lock, flags);
17608         rcu_read_lock();
17609         cs = nearest_hardwall_ancestor(task_cs(current));
17610         allowed = node_isset(node, cs->mems_allowed);
17611         rcu_read_unlock();
17613 -       spin_unlock_irqrestore(&callback_lock, flags);
17614 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
17615         return allowed;
17618 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
17619 index fc1ef736253c..83c666537a7a 100644
17620 --- a/kernel/debug/kdb/kdb_io.c
17621 +++ b/kernel/debug/kdb/kdb_io.c
17622 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17623         int linecount;
17624         int colcount;
17625         int logging, saved_loglevel = 0;
17626 -       int saved_trap_printk;
17627         int got_printf_lock = 0;
17628         int retlen = 0;
17629         int fnd, len;
17630 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
17631         unsigned long uninitialized_var(flags);
17633         preempt_disable();
17634 -       saved_trap_printk = kdb_trap_printk;
17635 -       kdb_trap_printk = 0;
17637         /* Serialize kdb_printf if multiple cpus try to write at once.
17638          * But if any cpu goes recursive in kdb, just print the output,
17639 @@ -855,7 +852,6 @@ kdb_print_out:
17640         } else {
17641                 __release(kdb_printf_lock);
17642         }
17643 -       kdb_trap_printk = saved_trap_printk;
17644         preempt_enable();
17645         return retlen;
17647 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
17648         va_list ap;
17649         int r;
17651 +       kdb_trap_printk++;
17652         va_start(ap, fmt);
17653         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
17654         va_end(ap);
17655 +       kdb_trap_printk--;
17657         return r;
17659 diff --git a/kernel/events/core.c b/kernel/events/core.c
17660 index 784ab8fe8714..03166fb2364e 100644
17661 --- a/kernel/events/core.c
17662 +++ b/kernel/events/core.c
17663 @@ -802,6 +802,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
17664         raw_spin_lock_init(&cpuctx->hrtimer_lock);
17665         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
17666         timer->function = perf_mux_hrtimer_handler;
17667 +       timer->irqsafe = 1;
17670  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
17671 @@ -7240,6 +7241,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
17673         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17674         hwc->hrtimer.function = perf_swevent_hrtimer;
17675 +       hwc->hrtimer.irqsafe = 1;
17677         /*
17678          * Since hrtimers have a fixed rate, we can do a static freq->period
17679 diff --git a/kernel/exit.c b/kernel/exit.c
17680 index ffba5df4abd5..e199407f8831 100644
17681 --- a/kernel/exit.c
17682 +++ b/kernel/exit.c
17683 @@ -144,7 +144,7 @@ static void __exit_signal(struct task_struct *tsk)
17684          * Do this under ->siglock, we can race with another thread
17685          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
17686          */
17687 -       flush_sigqueue(&tsk->pending);
17688 +       flush_task_sigqueue(tsk);
17689         tsk->sighand = NULL;
17690         spin_unlock(&sighand->siglock);
17692 diff --git a/kernel/fork.c b/kernel/fork.c
17693 index 0ee630f3ad4b..0fe2c188e94d 100644
17694 --- a/kernel/fork.c
17695 +++ b/kernel/fork.c
17696 @@ -108,7 +108,7 @@ int max_threads;            /* tunable limit on nr_threads */
17698  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
17700 -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
17701 +DEFINE_RWLOCK(tasklist_lock);  /* outer */
17703  #ifdef CONFIG_PROVE_RCU
17704  int lockdep_tasklist_lock_is_held(void)
17705 @@ -244,7 +244,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
17706         if (atomic_dec_and_test(&sig->sigcnt))
17707                 free_signal_struct(sig);
17710 +#ifdef CONFIG_PREEMPT_RT_BASE
17711 +static
17712 +#endif
17713  void __put_task_struct(struct task_struct *tsk)
17715         WARN_ON(!tsk->exit_state);
17716 @@ -261,7 +263,18 @@ void __put_task_struct(struct task_struct *tsk)
17717         if (!profile_handoff_task(tsk))
17718                 free_task(tsk);
17720 +#ifndef CONFIG_PREEMPT_RT_BASE
17721  EXPORT_SYMBOL_GPL(__put_task_struct);
17722 +#else
17723 +void __put_task_struct_cb(struct rcu_head *rhp)
17725 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
17727 +       __put_task_struct(tsk);
17730 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
17731 +#endif
17733  void __init __weak arch_task_cache_init(void) { }
17735 @@ -693,6 +706,19 @@ void __mmdrop(struct mm_struct *mm)
17737  EXPORT_SYMBOL_GPL(__mmdrop);
17739 +#ifdef CONFIG_PREEMPT_RT_BASE
17741 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
17742 + * want another facility to make this work.
17743 + */
17744 +void __mmdrop_delayed(struct rcu_head *rhp)
17746 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
17748 +       __mmdrop(mm);
17750 +#endif
17752  /*
17753   * Decrement the use count and release all resources for an mm.
17754   */
17755 @@ -1243,6 +1269,9 @@ static void rt_mutex_init_task(struct task_struct *p)
17756   */
17757  static void posix_cpu_timers_init(struct task_struct *tsk)
17759 +#ifdef CONFIG_PREEMPT_RT_BASE
17760 +       tsk->posix_timer_list = NULL;
17761 +#endif
17762         tsk->cputime_expires.prof_exp = 0;
17763         tsk->cputime_expires.virt_exp = 0;
17764         tsk->cputime_expires.sched_exp = 0;
17765 @@ -1369,15 +1398,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
17766         spin_lock_init(&p->alloc_lock);
17768         init_sigpending(&p->pending);
17769 +       p->sigqueue_cache = NULL;
17771         p->utime = p->stime = p->gtime = 0;
17772         p->utimescaled = p->stimescaled = 0;
17773         prev_cputime_init(&p->prev_cputime);
17775  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
17776 -       seqlock_init(&p->vtime_seqlock);
17777 +       seqcount_init(&p->vtime_seqcount);
17778         p->vtime_snap = 0;
17779 -       p->vtime_snap_whence = VTIME_SLEEPING;
17780 +       p->vtime_snap_whence = VTIME_INACTIVE;
17781  #endif
17783  #if defined(SPLIT_RSS_COUNTING)
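
On PREEMPT_RT the fork.c hunks above defer the final __put_task_struct()/__mmdrop() through RCU callbacks (__put_task_struct_cb(), __mmdrop_delayed()) so the actual free never runs from a context that cannot sleep. A small userspace sketch of that deferred-free pattern, assuming POSIX threads and C11 atomics; the reclaim thread merely stands in for the RCU callback machinery and every name here is illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative object; the real code defers task_struct / mm_struct frees. */
struct obj {
	atomic_int refcount;
	struct obj *next;	/* plays the role of the rcu_head linkage */
};

static struct obj *defer_list;
static pthread_mutex_t defer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  defer_cv   = PTHREAD_COND_INITIALIZER;

/* "call_rcu()" stand-in: queue the object for a later, sleepable context. */
static void defer_free(struct obj *o)
{
	pthread_mutex_lock(&defer_lock);
	o->next = defer_list;
	defer_list = o;
	pthread_cond_signal(&defer_cv);
	pthread_mutex_unlock(&defer_lock);
}

static void put_obj(struct obj *o)
{
	/* Last reference: do NOT free here (caller may be atomic); defer it. */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		defer_free(o);
}

/* Reclaim thread: the analogue of the RCU callback doing the real free. */
static void *reclaimer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&defer_lock);
	while (!defer_list)
		pthread_cond_wait(&defer_cv, &defer_lock);
	while (defer_list) {
		struct obj *o = defer_list;
		defer_list = o->next;
		pthread_mutex_unlock(&defer_lock);
		free(o);				/* the deferred destruction */
		puts("object freed from reclaim context");
		pthread_mutex_lock(&defer_lock);
	}
	pthread_mutex_unlock(&defer_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct obj *o = calloc(1, sizeof(*o));

	atomic_store(&o->refcount, 1);
	pthread_create(&t, NULL, reclaimer, NULL);
	put_obj(o);		/* drop the last reference from the "hot" context */
	pthread_join(t, NULL);
	return 0;
}
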
17784 diff --git a/kernel/futex.c b/kernel/futex.c
17785 index 3057dabf726f..9604b73ada47 100644
17786 --- a/kernel/futex.c
17787 +++ b/kernel/futex.c
17788 @@ -815,7 +815,9 @@ void exit_pi_state_list(struct task_struct *curr)
17789                  * task still owns the PI-state:
17790                  */
17791                 if (head->next != next) {
17792 +                       raw_spin_unlock_irq(&curr->pi_lock);
17793                         spin_unlock(&hb->lock);
17794 +                       raw_spin_lock_irq(&curr->pi_lock);
17795                         continue;
17796                 }
17798 @@ -1210,6 +1212,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17799         struct futex_pi_state *pi_state = this->pi_state;
17800         u32 uninitialized_var(curval), newval;
17801         WAKE_Q(wake_q);
17802 +       WAKE_Q(wake_sleeper_q);
17803         bool deboost;
17804         int ret = 0;
17806 @@ -1223,7 +1226,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17807         if (pi_state->owner != current)
17808                 return -EINVAL;
17810 -       raw_spin_lock(&pi_state->pi_mutex.wait_lock);
17811 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
17812         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
17814         /*
17815 @@ -1259,24 +1262,25 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17816                         ret = -EINVAL;
17817         }
17818         if (ret) {
17819 -               raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17820 +               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17821                 return ret;
17822         }
17824 -       raw_spin_lock_irq(&pi_state->owner->pi_lock);
17825 +       raw_spin_lock(&pi_state->owner->pi_lock);
17826         WARN_ON(list_empty(&pi_state->list));
17827         list_del_init(&pi_state->list);
17828 -       raw_spin_unlock_irq(&pi_state->owner->pi_lock);
17829 +       raw_spin_unlock(&pi_state->owner->pi_lock);
17831 -       raw_spin_lock_irq(&new_owner->pi_lock);
17832 +       raw_spin_lock(&new_owner->pi_lock);
17833         WARN_ON(!list_empty(&pi_state->list));
17834         list_add(&pi_state->list, &new_owner->pi_state_list);
17835         pi_state->owner = new_owner;
17836 -       raw_spin_unlock_irq(&new_owner->pi_lock);
17837 +       raw_spin_unlock(&new_owner->pi_lock);
17839 -       raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
17840 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
17842 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
17843 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
17844 +                                       &wake_sleeper_q);
17846         /*
17847          * First unlock HB so the waiter does not spin on it once he got woken
17848 @@ -1284,8 +1288,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
17849          * deboost first (and lose our higher priority), then the task might get
17850          * scheduled away before the wake up can take place.
17851          */
17852 -       spin_unlock(&hb->lock);
17853 +       deboost |= spin_unlock_no_deboost(&hb->lock);
17854         wake_up_q(&wake_q);
17855 +       wake_up_q_sleeper(&wake_sleeper_q);
17856         if (deboost)
17857                 rt_mutex_adjust_prio(current);
17859 @@ -1822,6 +1827,16 @@ retry_private:
17860                                 requeue_pi_wake_futex(this, &key2, hb2);
17861                                 drop_count++;
17862                                 continue;
17863 +                       } else if (ret == -EAGAIN) {
17864 +                               /*
17865 +                                * Waiter was woken by timeout or
17866 +                                * signal and has set pi_blocked_on to
17867 +                                * PI_WAKEUP_INPROGRESS before we
17868 +                                * tried to enqueue it on the rtmutex.
17869 +                                */
17870 +                               this->pi_state = NULL;
17871 +                               free_pi_state(pi_state);
17872 +                               continue;
17873                         } else if (ret) {
17874                                 /* -EDEADLK */
17875                                 this->pi_state = NULL;
17876 @@ -2139,11 +2154,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
17877                  * we returned due to timeout or signal without taking the
17878                  * rt_mutex. Too late.
17879                  */
17880 -               raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
17881 +               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
17882                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
17883                 if (!owner)
17884                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
17885 -               raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
17886 +               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
17887                 ret = fixup_pi_state_owner(uaddr, q, owner);
17888                 goto out;
17889         }
17890 @@ -2690,7 +2705,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17892         struct hrtimer_sleeper timeout, *to = NULL;
17893         struct rt_mutex_waiter rt_waiter;
17894 -       struct futex_hash_bucket *hb;
17895 +       struct futex_hash_bucket *hb, *hb2;
17896         union futex_key key2 = FUTEX_KEY_INIT;
17897         struct futex_q q = futex_q_init;
17898         int res, ret;
17899 @@ -2715,10 +2730,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17900          * The waiter is allocated on our stack, manipulated by the requeue
17901          * code while we sleep on uaddr.
17902          */
17903 -       debug_rt_mutex_init_waiter(&rt_waiter);
17904 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
17905 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
17906 -       rt_waiter.task = NULL;
17907 +       rt_mutex_init_waiter(&rt_waiter, false);
17909         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
17910         if (unlikely(ret != 0))
17911 @@ -2749,20 +2761,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17912         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
17913         futex_wait_queue_me(hb, &q, to);
17915 -       spin_lock(&hb->lock);
17916 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17917 -       spin_unlock(&hb->lock);
17918 -       if (ret)
17919 -               goto out_put_keys;
17920 +       /*
17921 +        * On RT we must avoid races with requeue and trying to block
17922 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
17923 +        * serializing access to pi_blocked_on with pi_lock.
17924 +        */
17925 +       raw_spin_lock_irq(&current->pi_lock);
17926 +       if (current->pi_blocked_on) {
17927 +               /*
17928 +                * We have been requeued or are in the process of
17929 +                * being requeued.
17930 +                */
17931 +               raw_spin_unlock_irq(&current->pi_lock);
17932 +       } else {
17933 +               /*
17934 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
17935 +                * prevents a concurrent requeue from moving us to the
17936 +                * uaddr2 rtmutex. After that we can safely acquire
17937 +                * (and possibly block on) hb->lock.
17938 +                */
17939 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
17940 +               raw_spin_unlock_irq(&current->pi_lock);
17942 +               spin_lock(&hb->lock);
17944 +               /*
17945 +                * Clean up pi_blocked_on. We might leak it otherwise
17946 +                * when we succeeded with the hb->lock in the fast
17947 +                * path.
17948 +                */
17949 +               raw_spin_lock_irq(&current->pi_lock);
17950 +               current->pi_blocked_on = NULL;
17951 +               raw_spin_unlock_irq(&current->pi_lock);
17953 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
17954 +               spin_unlock(&hb->lock);
17955 +               if (ret)
17956 +                       goto out_put_keys;
17957 +       }
17959         /*
17960 -        * In order for us to be here, we know our q.key == key2, and since
17961 -        * we took the hb->lock above, we also know that futex_requeue() has
17962 -        * completed and we no longer have to concern ourselves with a wakeup
17963 -        * race with the atomic proxy lock acquisition by the requeue code. The
17964 -        * futex_requeue dropped our key1 reference and incremented our key2
17965 -        * reference count.
17966 +        * In order to be here, we have either been requeued, are in
17967 +        * the process of being requeued, or requeue successfully
17968 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
17969 +        * non-null above, we may be racing with a requeue.  Do not
17970 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
17971 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
17972 +        * reference and incremented our key2 reference count.
17973          */
17974 +       hb2 = hash_futex(&key2);
17976         /* Check if the requeue code acquired the second futex for us. */
17977         if (!q.rt_waiter) {
17978 @@ -2771,7 +2818,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17979                  * did a lock-steal - fix up the PI-state in that case.
17980                  */
17981                 if (q.pi_state && (q.pi_state->owner != current)) {
17982 -                       spin_lock(q.lock_ptr);
17983 +                       spin_lock(&hb2->lock);
17984 +                       BUG_ON(&hb2->lock != q.lock_ptr);
17985                         ret = fixup_pi_state_owner(uaddr2, &q, current);
17986                         if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
17987                                 rt_mutex_unlock(&q.pi_state->pi_mutex);
17988 @@ -2780,7 +2828,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17989                          * the requeue_pi() code acquired for us.
17990                          */
17991                         free_pi_state(q.pi_state);
17992 -                       spin_unlock(q.lock_ptr);
17993 +                       spin_unlock(&hb2->lock);
17994                 }
17995         } else {
17996                 struct rt_mutex *pi_mutex;
17997 @@ -2795,7 +2843,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
17998                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
17999                 debug_rt_mutex_free_waiter(&rt_waiter);
18001 -               spin_lock(q.lock_ptr);
18002 +               spin_lock(&hb2->lock);
18003 +               BUG_ON(&hb2->lock != q.lock_ptr);
18004                 /*
18005                  * Fixup the pi_state owner and possibly acquire the lock if we
18006                  * haven't already.
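
The futex_wait_requeue_pi() rework above serializes a woken waiter against a concurrent requeue through pi_blocked_on: the waiter marks itself PI_WAKEUP_INPROGRESS under its pi_lock before touching hb->lock, and the requeue side backs off (the -EAGAIN branch) when it sees that marker. A compact userspace sketch of that two-sided protocol, assuming POSIX threads; a plain pointer models pi_blocked_on and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

#define WAKEUP_INPROGRESS ((void *)1)	/* models PI_WAKEUP_INPROGRESS */

struct waiter {
	pthread_mutex_t pi_lock;	/* models task->pi_lock */
	void *pi_blocked_on;		/* NULL, WAKEUP_INPROGRESS, or a "rtmutex" */
};

static struct waiter w = { PTHREAD_MUTEX_INITIALIZER, NULL };
static pthread_mutex_t hb_lock = PTHREAD_MUTEX_INITIALIZER;	/* models hb->lock */

/* Requeue side: only move the waiter if it is not already waking up. */
static void *requeue_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&w.pi_lock);
	if (w.pi_blocked_on == WAKEUP_INPROGRESS)
		puts("requeue: waiter is waking up, back off (-EAGAIN path)");
	else
		w.pi_blocked_on = &hb_lock;	/* "enqueue it on the rtmutex" */
	pthread_mutex_unlock(&w.pi_lock);
	return NULL;
}

/* Woken waiter side, mirroring the hunk above. */
static void waiter_side(void)
{
	pthread_mutex_lock(&w.pi_lock);
	if (w.pi_blocked_on) {
		/* Already requeued (or being requeued): don't touch hb_lock. */
		pthread_mutex_unlock(&w.pi_lock);
		puts("waiter: already requeued");
		return;
	}
	w.pi_blocked_on = WAKEUP_INPROGRESS;	/* blocks a concurrent requeue */
	pthread_mutex_unlock(&w.pi_lock);

	pthread_mutex_lock(&hb_lock);		/* may block; now safe to do so */
	pthread_mutex_lock(&w.pi_lock);
	w.pi_blocked_on = NULL;			/* clean the marker up again */
	pthread_mutex_unlock(&w.pi_lock);
	/* ...the handle_early_requeue_pi_wakeup() equivalent would run here... */
	pthread_mutex_unlock(&hb_lock);
	puts("waiter: handled early wakeup under the hash-bucket lock");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, requeue_side, NULL);
	waiter_side();
	pthread_join(t, NULL);
	return 0;
}
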
18007 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
18008 index 57bff7857e87..6c65c9252991 100644
18009 --- a/kernel/irq/handle.c
18010 +++ b/kernel/irq/handle.c
18011 @@ -134,6 +134,8 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
18013  irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
18015 +       struct pt_regs *regs = get_irq_regs();
18016 +       u64 ip = regs ? instruction_pointer(regs) : 0;
18017         irqreturn_t retval = IRQ_NONE;
18018         unsigned int flags = 0, irq = desc->irq_data.irq;
18019         struct irqaction *action = desc->action;
18020 @@ -176,7 +178,11 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
18021                 action = action->next;
18022         }
18024 -       add_interrupt_randomness(irq, flags);
18025 +#ifdef CONFIG_PREEMPT_RT_FULL
18026 +       desc->random_ip = ip;
18027 +#else
18028 +       add_interrupt_randomness(irq, flags, ip);
18029 +#endif
18031         if (!noirqdebug)
18032                 note_interrupt(desc, retval);
18033 diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
18034 index 239e2ae2c947..0b73349a42d5 100644
18035 --- a/kernel/irq/irqdesc.c
18036 +++ b/kernel/irq/irqdesc.c
18037 @@ -24,10 +24,27 @@
18038  static struct lock_class_key irq_desc_lock_class;
18040  #if defined(CONFIG_SMP)
18041 +static int __init irq_affinity_setup(char *str)
18043 +       zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18044 +       cpulist_parse(str, irq_default_affinity);
18045 +       /*
18046 +        * Set at least the boot cpu. We don't want to end up with
18047 +        * bug reports caused by random command line masks
18048 +        */
18049 +       cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
18050 +       return 1;
18052 +__setup("irqaffinity=", irq_affinity_setup);
18054  static void __init init_irq_default_affinity(void)
18056 -       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18057 -       cpumask_setall(irq_default_affinity);
18058 +#ifdef CONFIG_CPUMASK_OFFSTACK
18059 +       if (!irq_default_affinity)
18060 +               zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
18061 +#endif
18062 +       if (cpumask_empty(irq_default_affinity))
18063 +               cpumask_setall(irq_default_affinity);
18065  #else
18066  static void __init init_irq_default_affinity(void)
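
The irqdesc.c hunk above adds an irqaffinity= boot parameter and forces the boot CPU into the resulting default mask so a bogus command line cannot leave it empty. A standalone model of that safeguard using a plain 64-bit mask; the kernel uses cpumasks and cpulist_parse(), and this simplified parser (single CPUs only, no ranges) is purely an illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse a simplified "a,b,c" CPU list into a bitmask (no range support). */
static uint64_t parse_cpulist(const char *str)
{
	uint64_t mask = 0;
	char *end;

	while (*str) {
		long cpu = strtol(str, &end, 10);

		if (end == str)
			break;			/* garbage: stop parsing */
		if (cpu >= 0 && cpu < 64)
			mask |= 1ULL << cpu;
		str = (*end == ',') ? end + 1 : end;
		if (end == str && *end)
			break;			/* trailing junk: stop parsing */
	}
	return mask;
}

int main(void)
{
	unsigned int boot_cpu = 0;		/* smp_processor_id() stand-in */
	uint64_t mask = parse_cpulist("nonsense");

	/* The hunk's safeguard: always include at least the boot CPU. */
	mask |= 1ULL << boot_cpu;

	printf("default irq affinity mask: 0x%llx\n", (unsigned long long)mask);
	return 0;
}
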
18067 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
18068 index 6ead200370da..8e89554aa345 100644
18069 --- a/kernel/irq/manage.c
18070 +++ b/kernel/irq/manage.c
18071 @@ -22,6 +22,7 @@
18072  #include "internals.h"
18074  #ifdef CONFIG_IRQ_FORCED_THREADING
18075 +# ifndef CONFIG_PREEMPT_RT_BASE
18076  __read_mostly bool force_irqthreads;
18078  static int __init setup_forced_irqthreads(char *arg)
18079 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
18080         return 0;
18082  early_param("threadirqs", setup_forced_irqthreads);
18083 +# endif
18084  #endif
18086  static void __synchronize_hardirq(struct irq_desc *desc)
18087 @@ -181,6 +183,62 @@ static inline void
18088  irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
18089  #endif
18091 +#ifdef CONFIG_PREEMPT_RT_FULL
18092 +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
18093 +static struct task_struct *set_affinity_helper;
18094 +static LIST_HEAD(affinity_list);
18095 +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
18097 +static int set_affinity_thread(void *unused)
18099 +       while (1) {
18100 +               struct irq_affinity_notify *notify;
18101 +               int empty;
18103 +               set_current_state(TASK_INTERRUPTIBLE);
18105 +               raw_spin_lock_irq(&affinity_list_lock);
18106 +               empty = list_empty(&affinity_list);
18107 +               raw_spin_unlock_irq(&affinity_list_lock);
18109 +               if (empty)
18110 +                       schedule();
18111 +               if (kthread_should_stop())
18112 +                       break;
18113 +               set_current_state(TASK_RUNNING);
18114 +try_next:
18115 +               notify = NULL;
18117 +               raw_spin_lock_irq(&affinity_list_lock);
18118 +               if (!list_empty(&affinity_list)) {
18119 +                       notify = list_first_entry(&affinity_list,
18120 +                                       struct irq_affinity_notify, list);
18121 +                       list_del_init(&notify->list);
18122 +               }
18123 +               raw_spin_unlock_irq(&affinity_list_lock);
18125 +               if (!notify)
18126 +                       continue;
18127 +               _irq_affinity_notify(notify);
18128 +               goto try_next;
18129 +       }
18130 +       return 0;
18133 +static void init_helper_thread(void)
18135 +       if (set_affinity_helper)
18136 +               return;
18137 +       set_affinity_helper = kthread_run(set_affinity_thread, NULL,
18138 +                       "affinity-cb");
18139 +       WARN_ON(IS_ERR(set_affinity_helper));
18141 +#else
18143 +static inline void init_helper_thread(void) { }
18145 +#endif
18147  int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
18148                         bool force)
18150 @@ -220,7 +278,17 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
18152         if (desc->affinity_notify) {
18153                 kref_get(&desc->affinity_notify->kref);
18155 +#ifdef CONFIG_PREEMPT_RT_FULL
18156 +               raw_spin_lock(&affinity_list_lock);
18157 +               if (list_empty(&desc->affinity_notify->list))
18158 +                       list_add_tail(&affinity_list,
18159 +                                       &desc->affinity_notify->list);
18160 +               raw_spin_unlock(&affinity_list_lock);
18161 +               wake_up_process(set_affinity_helper);
18162 +#else
18163                 schedule_work(&desc->affinity_notify->work);
18164 +#endif
18165         }
18166         irqd_set(data, IRQD_AFFINITY_SET);
18168 @@ -258,10 +326,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
18170  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
18172 -static void irq_affinity_notify(struct work_struct *work)
18173 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
18175 -       struct irq_affinity_notify *notify =
18176 -               container_of(work, struct irq_affinity_notify, work);
18177         struct irq_desc *desc = irq_to_desc(notify->irq);
18178         cpumask_var_t cpumask;
18179         unsigned long flags;
18180 @@ -283,6 +349,13 @@ out:
18181         kref_put(&notify->kref, notify->release);
18184 +static void irq_affinity_notify(struct work_struct *work)
18186 +       struct irq_affinity_notify *notify =
18187 +               container_of(work, struct irq_affinity_notify, work);
18188 +       _irq_affinity_notify(notify);
18191  /**
18192   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
18193   *     @irq:           Interrupt for which to enable/disable notification
18194 @@ -312,6 +385,8 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
18195                 notify->irq = irq;
18196                 kref_init(&notify->kref);
18197                 INIT_WORK(&notify->work, irq_affinity_notify);
18198 +               INIT_LIST_HEAD(&notify->list);
18199 +               init_helper_thread();
18200         }
18202         raw_spin_lock_irqsave(&desc->lock, flags);
18203 @@ -865,7 +940,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
18204         local_bh_disable();
18205         ret = action->thread_fn(action->irq, action->dev_id);
18206         irq_finalize_oneshot(desc, action);
18207 -       local_bh_enable();
18208 +       /*
18209 +        * Interrupts which have real time requirements can be set up
18210 +        * to avoid softirq processing in the thread handler. This is
18211 +        * safe as these interrupts do not raise soft interrupts.
18212 +        */
18213 +       if (irq_settings_no_softirq_call(desc))
18214 +               _local_bh_enable();
18215 +       else
18216 +               local_bh_enable();
18217         return ret;
18220 @@ -962,6 +1045,12 @@ static int irq_thread(void *data)
18221                 if (action_ret == IRQ_WAKE_THREAD)
18222                         irq_wake_secondary(desc, action);
18224 +#ifdef CONFIG_PREEMPT_RT_FULL
18225 +               migrate_disable();
18226 +               add_interrupt_randomness(action->irq, 0,
18227 +                                desc->random_ip ^ (unsigned long) action);
18228 +               migrate_enable();
18229 +#endif
18230                 wake_threads_waitq(desc);
18231         }
18233 @@ -1315,6 +1404,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
18234                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
18235                 }
18237 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
18238 +                       irq_settings_set_no_softirq_call(desc);
18240                 /* Set default affinity mask once everything is setup */
18241                 setup_affinity(desc, mask);
18243 @@ -1968,7 +2060,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
18244   *     This call sets the internal irqchip state of an interrupt,
18245   *     depending on the value of @which.
18246   *
18247 - *     This function should be called with preemption disabled if the
18248 + *     This function should be called with migration disabled if the
18249   *     interrupt controller has per-cpu registers.
18250   */
18251  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
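
With the manage.c hunks above, a force-threaded handler whose descriptor carries IRQ_NO_SOFTIRQ_CALL re-enables BH via _local_bh_enable(), so pending softirqs are not run in that irq thread; __setup_irq() derives the setting from the IRQF_NO_SOFTIRQ_CALL request flag. A sketch of how a driver built against this patch might request such an interrupt; it only builds in a patched kernel tree, and the IRQ number, device name and handler below are placeholders.

#include <linux/interrupt.h>
#include <linux/module.h>

#define MY_IRQ 42	/* placeholder IRQ number */

static irqreturn_t my_handler(int irq, void *dev_id)
{
	/* On RT this runs force-threaded; it must not depend on softirqs. */
	return IRQ_HANDLED;
}

static int __init my_init(void)
{
	/*
	 * IRQF_NO_SOFTIRQ_CALL (added by this patch) makes the forced irq
	 * thread use _local_bh_enable(), so pending softirqs are not
	 * processed in this handler's thread.
	 */
	return request_irq(MY_IRQ, my_handler, IRQF_NO_SOFTIRQ_CALL,
			   "my-rt-dev", NULL);
}

static void __exit my_exit(void)
{
	free_irq(MY_IRQ, NULL);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
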
18252 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
18253 index 320579d89091..2df2d4445b1e 100644
18254 --- a/kernel/irq/settings.h
18255 +++ b/kernel/irq/settings.h
18256 @@ -16,6 +16,7 @@ enum {
18257         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
18258         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
18259         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
18260 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
18261         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
18262  };
18264 @@ -30,6 +31,7 @@ enum {
18265  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
18266  #define IRQ_IS_POLLED          GOT_YOU_MORON
18267  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
18268 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
18269  #undef IRQF_MODIFY_MASK
18270  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
18272 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
18273         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
18276 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
18278 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
18281 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
18283 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
18286  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
18288         return desc->status_use_accessors & _IRQ_PER_CPU;
18289 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
18290 index 32144175458d..ed26f2554972 100644
18291 --- a/kernel/irq/spurious.c
18292 +++ b/kernel/irq/spurious.c
18293 @@ -444,6 +444,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
18295  static int __init irqfixup_setup(char *str)
18297 +#ifdef CONFIG_PREEMPT_RT_BASE
18298 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
18299 +       return 1;
18300 +#endif
18301         irqfixup = 1;
18302         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
18303         printk(KERN_WARNING "This may impact system performance.\n");
18304 @@ -456,6 +460,10 @@ module_param(irqfixup, int, 0644);
18306  static int __init irqpoll_setup(char *str)
18308 +#ifdef CONFIG_PREEMPT_RT_BASE
18309 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
18310 +       return 1;
18311 +#endif
18312         irqfixup = 2;
18313         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
18314                                 "enabled\n");
18315 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
18316 index bcf107ce0854..2899ba0d23d1 100644
18317 --- a/kernel/irq_work.c
18318 +++ b/kernel/irq_work.c
18319 @@ -17,6 +17,7 @@
18320  #include <linux/cpu.h>
18321  #include <linux/notifier.h>
18322  #include <linux/smp.h>
18323 +#include <linux/interrupt.h>
18324  #include <asm/processor.h>
18327 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
18328   */
18329  bool irq_work_queue_on(struct irq_work *work, int cpu)
18331 +       struct llist_head *list;
18333         /* All work should have been flushed before going offline */
18334         WARN_ON_ONCE(cpu_is_offline(cpu));
18336 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
18337         if (!irq_work_claim(work))
18338                 return false;
18340 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
18341 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
18342 +               list = &per_cpu(lazy_list, cpu);
18343 +       else
18344 +               list = &per_cpu(raised_list, cpu);
18346 +       if (llist_add(&work->llnode, list))
18347                 arch_send_call_function_single_ipi(cpu);
18349         return true;
18350 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
18351  /* Enqueue the irq work @work on the current CPU */
18352  bool irq_work_queue(struct irq_work *work)
18354 +       struct llist_head *list;
18355 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
18357         /* Only queue if not already pending */
18358         if (!irq_work_claim(work))
18359                 return false;
18360 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
18361         /* Queue the entry and raise the IPI if needed. */
18362         preempt_disable();
18364 -       /* If the work is "lazy", handle it from next tick if any */
18365 -       if (work->flags & IRQ_WORK_LAZY) {
18366 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
18367 -                   tick_nohz_tick_stopped())
18368 -                       arch_irq_work_raise();
18369 -       } else {
18370 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
18371 +       lazy_work = work->flags & IRQ_WORK_LAZY;
18373 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
18374 +               list = this_cpu_ptr(&lazy_list);
18375 +       else
18376 +               list = this_cpu_ptr(&raised_list);
18378 +       if (llist_add(&work->llnode, list)) {
18379 +               if (!lazy_work || tick_nohz_tick_stopped())
18380                         arch_irq_work_raise();
18381         }
18383 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
18384         raised = this_cpu_ptr(&raised_list);
18385         lazy = this_cpu_ptr(&lazy_list);
18387 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
18388 -               if (llist_empty(lazy))
18389 -                       return false;
18390 +       if (llist_empty(raised) && llist_empty(lazy))
18391 +               return false;
18393         /* All work should have been flushed before going offline */
18394         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
18395 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
18396         struct irq_work *work;
18397         struct llist_node *llnode;
18399 -       BUG_ON(!irqs_disabled());
18400 +       BUG_ON_NONRT(!irqs_disabled());
18402         if (llist_empty(list))
18403                 return;
18404 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
18405  void irq_work_run(void)
18407         irq_work_run_list(this_cpu_ptr(&raised_list));
18408 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
18409 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
18410 +               /*
18411 +                * NOTE: we raise softirq via IPI for safety,
18412 +                * and execute in irq_work_tick() to move the
18413 +                * overhead from hard to soft irq context.
18414 +                */
18415 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
18416 +                       raise_softirq(TIMER_SOFTIRQ);
18417 +       } else
18418 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
18420  EXPORT_SYMBOL_GPL(irq_work_run);
18422 @@ -179,8 +200,17 @@ void irq_work_tick(void)
18424         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
18425                 irq_work_run_list(raised);
18427 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
18428 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
18431 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
18432 +void irq_work_tick_soft(void)
18434         irq_work_run_list(this_cpu_ptr(&lazy_list));
18436 +#endif
18438  /*
18439   * Synchronize against the irq_work @entry, ensures the entry is not
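
The irq_work.c changes route a work item either to the raised list (run from the hard interrupt) or to the lazy list (run from the timer softirq on RT), depending on IRQ_WORK_LAZY, IRQ_WORK_HARD_IRQ and CONFIG_PREEMPT_RT_FULL. A tiny standalone C model of just that routing decision; the flag values below are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; the kernel defines its own values. */
#define IRQ_WORK_LAZY     0x1
#define IRQ_WORK_HARD_IRQ 0x2

enum target_list { RAISED_LIST, LAZY_LIST };

/* Mirrors the list selection in the patched irq_work_queue(). */
static enum target_list pick_list(unsigned int flags, bool rt_full)
{
	bool lazy_work = flags & IRQ_WORK_LAZY;

	if (lazy_work || (rt_full && !(flags & IRQ_WORK_HARD_IRQ)))
		return LAZY_LIST;
	return RAISED_LIST;
}

/* Mirrors the "raise the irq_work interrupt?" check after llist_add(). */
static bool must_kick(unsigned int flags, bool tick_stopped)
{
	bool lazy_work = flags & IRQ_WORK_LAZY;

	return !lazy_work || tick_stopped;
}

int main(void)
{
	printf("RT, plain work    -> %s\n",
	       pick_list(0, true) == LAZY_LIST ? "lazy" : "raised");
	printf("RT, HARD_IRQ work -> %s\n",
	       pick_list(IRQ_WORK_HARD_IRQ, true) == LAZY_LIST ? "lazy" : "raised");
	printf("!RT, LAZY work    -> %s, kick=%d\n",
	       pick_list(IRQ_WORK_LAZY, false) == LAZY_LIST ? "lazy" : "raised",
	       must_kick(IRQ_WORK_LAZY, false));
	return 0;
}
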
18440 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
18441 index e83b26464061..c0e08d1cf33e 100644
18442 --- a/kernel/ksysfs.c
18443 +++ b/kernel/ksysfs.c
18444 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
18446  #endif /* CONFIG_KEXEC_CORE */
18448 +#if defined(CONFIG_PREEMPT_RT_FULL)
18449 +static ssize_t  realtime_show(struct kobject *kobj,
18450 +                             struct kobj_attribute *attr, char *buf)
18452 +       return sprintf(buf, "%d\n", 1);
18454 +KERNEL_ATTR_RO(realtime);
18455 +#endif
18457  /* whether file capabilities are enabled */
18458  static ssize_t fscaps_show(struct kobject *kobj,
18459                                   struct kobj_attribute *attr, char *buf)
18460 @@ -203,6 +212,9 @@ static struct attribute * kernel_attrs[] = {
18461         &vmcoreinfo_attr.attr,
18462  #endif
18463         &rcu_expedited_attr.attr,
18464 +#ifdef CONFIG_PREEMPT_RT_FULL
18465 +       &realtime_attr.attr,
18466 +#endif
18467         NULL
18468  };
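
The ksysfs.c hunk exports a read-only realtime attribute with value 1 when CONFIG_PREEMPT_RT_FULL is enabled; kernel_attrs entries show up under /sys/kernel/, so userspace can probe /sys/kernel/realtime. A minimal check along those lines, treating a missing file as "not an RT_FULL kernel":

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (f) {
		if (fscanf(f, "%d", &rt) != 1)
			rt = 0;		/* unreadable: assume not RT_FULL */
		fclose(f);
	}
	printf("PREEMPT_RT_FULL kernel: %s\n", rt == 1 ? "yes" : "no");
	return 0;
}
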
18470 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
18471 index 8e96f6cc2a4a..447b03082d88 100644
18472 --- a/kernel/locking/Makefile
18473 +++ b/kernel/locking/Makefile
18474 @@ -1,5 +1,5 @@
18476 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
18477 +obj-y += semaphore.o percpu-rwsem.o
18479  ifdef CONFIG_FUNCTION_TRACER
18480  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
18481 @@ -8,7 +8,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
18482  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
18483  endif
18485 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
18486 +obj-y += mutex.o
18487  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
18488 +obj-y += rwsem.o
18489 +endif
18490  obj-$(CONFIG_LOCKDEP) += lockdep.o
18491  ifeq ($(CONFIG_PROC_FS),y)
18492  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
18493 @@ -22,7 +26,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
18494  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
18495  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
18496  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
18497 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
18498  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
18499  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
18500 +endif
18501 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
18502  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
18503  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
18504 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
18505 index 951cfcd10b4a..57e0ea72c28a 100644
18506 --- a/kernel/locking/lglock.c
18507 +++ b/kernel/locking/lglock.c
18508 @@ -4,6 +4,15 @@
18509  #include <linux/cpu.h>
18510  #include <linux/string.h>
18512 +#ifndef CONFIG_PREEMPT_RT_FULL
18513 +# define lg_lock_ptr           arch_spinlock_t
18514 +# define lg_do_lock(l)         arch_spin_lock(l)
18515 +# define lg_do_unlock(l)       arch_spin_unlock(l)
18516 +#else
18517 +# define lg_lock_ptr           struct rt_mutex
18518 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
18519 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
18520 +#endif
18521  /*
18522   * Note there is no uninit, so lglocks cannot be defined in
18523   * modules (but it's fine to use them from there)
18524 @@ -12,51 +21,60 @@
18526  void lg_lock_init(struct lglock *lg, char *name)
18528 +#ifdef CONFIG_PREEMPT_RT_FULL
18529 +       int i;
18531 +       for_each_possible_cpu(i) {
18532 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
18534 +               rt_mutex_init(lock);
18535 +       }
18536 +#endif
18537         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
18539  EXPORT_SYMBOL(lg_lock_init);
18541  void lg_local_lock(struct lglock *lg)
18543 -       arch_spinlock_t *lock;
18544 +       lg_lock_ptr *lock;
18546 -       preempt_disable();
18547 +       migrate_disable();
18548         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18549         lock = this_cpu_ptr(lg->lock);
18550 -       arch_spin_lock(lock);
18551 +       lg_do_lock(lock);
18553  EXPORT_SYMBOL(lg_local_lock);
18555  void lg_local_unlock(struct lglock *lg)
18557 -       arch_spinlock_t *lock;
18558 +       lg_lock_ptr *lock;
18560         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18561         lock = this_cpu_ptr(lg->lock);
18562 -       arch_spin_unlock(lock);
18563 -       preempt_enable();
18564 +       lg_do_unlock(lock);
18565 +       migrate_enable();
18567  EXPORT_SYMBOL(lg_local_unlock);
18569  void lg_local_lock_cpu(struct lglock *lg, int cpu)
18571 -       arch_spinlock_t *lock;
18572 +       lg_lock_ptr *lock;
18574 -       preempt_disable();
18575 +       preempt_disable_nort();
18576         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18577         lock = per_cpu_ptr(lg->lock, cpu);
18578 -       arch_spin_lock(lock);
18579 +       lg_do_lock(lock);
18581  EXPORT_SYMBOL(lg_local_lock_cpu);
18583  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
18585 -       arch_spinlock_t *lock;
18586 +       lg_lock_ptr *lock;
18588         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18589         lock = per_cpu_ptr(lg->lock, cpu);
18590 -       arch_spin_unlock(lock);
18591 -       preempt_enable();
18592 +       lg_do_unlock(lock);
18593 +       preempt_enable_nort();
18595  EXPORT_SYMBOL(lg_local_unlock_cpu);
18597 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
18598         if (cpu2 < cpu1)
18599                 swap(cpu1, cpu2);
18601 -       preempt_disable();
18602 +       preempt_disable_nort();
18603         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18604 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
18605 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
18606 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
18607 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
18610  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
18612         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18613 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
18614 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
18615 -       preempt_enable();
18616 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
18617 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
18618 +       preempt_enable_nort();
18621  void lg_global_lock(struct lglock *lg)
18623         int i;
18625 -       preempt_disable();
18626 +       preempt_disable_nort();
18627         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18628         for_each_possible_cpu(i) {
18629 -               arch_spinlock_t *lock;
18630 +               lg_lock_ptr *lock;
18631                 lock = per_cpu_ptr(lg->lock, i);
18632 -               arch_spin_lock(lock);
18633 +               lg_do_lock(lock);
18634         }
18636  EXPORT_SYMBOL(lg_global_lock);
18637 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
18639         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
18640         for_each_possible_cpu(i) {
18641 -               arch_spinlock_t *lock;
18642 +               lg_lock_ptr *lock;
18643                 lock = per_cpu_ptr(lg->lock, i);
18644 -               arch_spin_unlock(lock);
18645 +               lg_do_unlock(lock);
18646         }
18647 -       preempt_enable();
18648 +       preempt_enable_nort();
18650  EXPORT_SYMBOL(lg_global_unlock);
18652 +#ifdef CONFIG_PREEMPT_RT_FULL
18654 + * HACK: If you use this, you get to keep the pieces.
18655 + * Used in queue_stop_cpus_work() when stop machinery
18656 + * is called from inactive CPU, so we can't schedule.
18657 + */
18658 +# define lg_do_trylock_relax(l)                        \
18659 +       do {                                    \
18660 +               while (!__rt_spin_trylock(l))   \
18661 +                       cpu_relax();            \
18662 +       } while (0)
18664 +void lg_global_trylock_relax(struct lglock *lg)
18666 +       int i;
18668 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
18669 +       for_each_possible_cpu(i) {
18670 +               lg_lock_ptr *lock;
18671 +               lock = per_cpu_ptr(lg->lock, i);
18672 +               lg_do_trylock_relax(lock);
18673 +       }
18675 +#endif
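The hunks above map each per-CPU arch_spinlock_t backing an lglock to an rt_mutex under PREEMPT_RT_FULL, so lg_global_lock() acquires one sleeping lock per possible CPU in a fixed CPU order, while lg_global_trylock_relax() busy-retries each one with cpu_relax() for the stop-machine corner case. Below is a minimal userspace sketch of that local/global pattern; all names are invented for illustration and pthread mutexes stand in for the kernel locks.

/*
 * Illustrative analogue of the lglock scheme: a "local" lock takes one
 * slot, a "global" lock takes every slot in the same fixed order so two
 * global lockers cannot deadlock against each other.
 */
#include <pthread.h>

#define NCPUS 4

static pthread_mutex_t lg[NCPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void lg_local_lock_demo(int cpu)    { pthread_mutex_lock(&lg[cpu]); }
static void lg_local_unlock_demo(int cpu)  { pthread_mutex_unlock(&lg[cpu]); }

static void lg_global_lock_demo(void)
{
	/* ascending order on every path keeps global lockers deadlock free */
	for (int i = 0; i < NCPUS; i++)
		pthread_mutex_lock(&lg[i]);
}

static void lg_global_unlock_demo(void)
{
	for (int i = 0; i < NCPUS; i++)
		pthread_mutex_unlock(&lg[i]);
}

int main(void)
{
	lg_local_lock_demo(0);
	lg_local_unlock_demo(0);
	lg_global_lock_demo();
	lg_global_unlock_demo();
	return 0;
}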
18676 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
18677 index 60ace56618f6..fd54dbf686cc 100644
18678 --- a/kernel/locking/lockdep.c
18679 +++ b/kernel/locking/lockdep.c
18680 @@ -668,6 +668,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18681         struct lockdep_subclass_key *key;
18682         struct list_head *hash_head;
18683         struct lock_class *class;
18684 +       bool is_static = false;
18686  #ifdef CONFIG_DEBUG_LOCKDEP
18687         /*
18688 @@ -695,10 +696,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18690         /*
18691          * Static locks do not have their class-keys yet - for them the key
18692 -        * is the lock object itself:
18693 +        * is the lock object itself. If the lock is in the per cpu area,
18694 +        * the canonical address of the lock (per cpu offset removed) is
18695 +        * used.
18696          */
18697 -       if (unlikely(!lock->key))
18698 -               lock->key = (void *)lock;
18699 +       if (unlikely(!lock->key)) {
18700 +               unsigned long can_addr, addr = (unsigned long)lock;
18702 +               if (__is_kernel_percpu_address(addr, &can_addr))
18703 +                       lock->key = (void *)can_addr;
18704 +               else if (__is_module_percpu_address(addr, &can_addr))
18705 +                       lock->key = (void *)can_addr;
18706 +               else if (static_obj(lock))
18707 +                       lock->key = (void *)lock;
18708 +               else
18709 +                       return ERR_PTR(-EINVAL);
18710 +               is_static = true;
18711 +       }
18713         /*
18714          * NOTE: the class-key must be unique. For dynamic locks, a static
18715 @@ -730,7 +744,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
18716                 }
18717         }
18719 -       return NULL;
18720 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
18723  /*
18724 @@ -748,19 +762,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
18725         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
18727         class = look_up_lock_class(lock, subclass);
18728 -       if (likely(class))
18729 +       if (likely(!IS_ERR_OR_NULL(class)))
18730                 goto out_set_class_cache;
18732         /*
18733          * Debug-check: all keys must be persistent!
18734 -        */
18735 -       if (!static_obj(lock->key)) {
18736 +        */
18737 +       if (IS_ERR(class)) {
18738                 debug_locks_off();
18739                 printk("INFO: trying to register non-static key.\n");
18740                 printk("the code is fine but needs lockdep annotation.\n");
18741                 printk("turning off the locking correctness validator.\n");
18742                 dump_stack();
18744                 return NULL;
18745         }
18747 @@ -3278,7 +3291,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
18748                  * Clearly if the lock hasn't been acquired _ever_, we're not
18749                  * holding it either, so report failure.
18750                  */
18751 -               if (!class)
18752 +               if (IS_ERR_OR_NULL(class))
18753                         return 0;
18755                 /*
18756 @@ -3525,6 +3538,7 @@ static void check_flags(unsigned long flags)
18757                 }
18758         }
18760 +#ifndef CONFIG_PREEMPT_RT_FULL
18761         /*
18762          * We dont accurately track softirq state in e.g.
18763          * hardirq contexts (such as on 4KSTACKS), so only
18764 @@ -3539,6 +3553,7 @@ static void check_flags(unsigned long flags)
18765                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
18766                 }
18767         }
18768 +#endif
18770         if (!debug_locks)
18771                 print_irqtrace_events(current);
18772 @@ -3977,7 +3992,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
18773                  * If the class exists we look it up and zap it:
18774                  */
18775                 class = look_up_lock_class(lock, j);
18776 -               if (class)
18777 +               if (!IS_ERR_OR_NULL(class))
18778                         zap_class(class);
18779         }
18780         /*
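look_up_lock_class() above now distinguishes "class not found" (NULL) from "key is not a valid static or per-CPU object" (ERR_PTR(-EINVAL)), which is why the callers switch from a plain NULL test to IS_ERR_OR_NULL(). A compressed userspace rendition of that pointer-encoded-error idiom follows; err_ptr, is_err and is_err_or_null are demo names, not the <linux/err.h> helpers.

#include <stdio.h>

#define MAX_ERRNO	4095
#define EINVAL_DEMO	22

static inline void *err_ptr(long error)      { return (void *)error; }
static inline long  ptr_err(const void *ptr) { return (long)ptr; }
static inline int   is_err(const void *ptr)
{
	/* the top MAX_ERRNO addresses encode negative error codes */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline int is_err_or_null(const void *ptr)
{
	return !ptr || is_err(ptr);
}

static void *lookup(int key)
{
	static int obj = 42;

	if (key < 0)
		return err_ptr(-EINVAL_DEMO);	/* invalid key */
	if (key != 1)
		return NULL;			/* not found */
	return &obj;				/* found */
}

int main(void)
{
	void *p = lookup(-1);

	if (is_err_or_null(p))
		printf("lookup failed: %ld\n", is_err(p) ? ptr_err(p) : 0L);
	return 0;
}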
18781 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
18782 index 8ef1919d63b2..291fc19e28e0 100644
18783 --- a/kernel/locking/locktorture.c
18784 +++ b/kernel/locking/locktorture.c
18785 @@ -26,7 +26,6 @@
18786  #include <linux/kthread.h>
18787  #include <linux/sched/rt.h>
18788  #include <linux/spinlock.h>
18789 -#include <linux/rwlock.h>
18790  #include <linux/mutex.h>
18791  #include <linux/rwsem.h>
18792  #include <linux/smp.h>
18793 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
18794 new file mode 100644
18795 index 000000000000..d4ab61c1848b
18796 --- /dev/null
18797 +++ b/kernel/locking/rt.c
18798 @@ -0,0 +1,474 @@
18800 + * kernel/rt.c
18801 + *
18802 + * Real-Time Preemption Support
18803 + *
18804 + * started by Ingo Molnar:
18805 + *
18806 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18807 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18808 + *
18809 + * historic credit for proving that Linux spinlocks can be implemented via
18810 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
18811 + * and others) who prototyped it on 2.4 and did lots of comparative
18812 + * research and analysis; TimeSys, for proving that you can implement a
18813 + * fully preemptible kernel via the use of IRQ threading and mutexes;
18814 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
18815 + * right one; and to MontaVista, who ported pmutexes to 2.6.
18816 + *
18817 + * This code is a from-scratch implementation and is not based on pmutexes,
18818 + * but the idea of converting spinlocks to mutexes is used here too.
18819 + *
18820 + * lock debugging, locking tree, deadlock detection:
18821 + *
18822 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
18823 + *  Released under the General Public License (GPL).
18824 + *
18825 + * Includes portions of the generic R/W semaphore implementation from:
18826 + *
18827 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
18828 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
18829 + *  - Derived also from comments by Linus
18830 + *
18831 + * Pending ownership of locks and ownership stealing:
18832 + *
18833 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
18834 + *
18835 + *   (also by Steven Rostedt)
18836 + *    - Converted single pi_lock to individual task locks.
18837 + *
18838 + * By Esben Nielsen:
18839 + *    Doing priority inheritance with help of the scheduler.
18840 + *
18841 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18842 + *  - major rework based on Esben Nielsen's initial patch
18843 + *  - replaced thread_info references by task_struct refs
18844 + *  - removed task->pending_owner dependency
18845 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
18846 + *    in the scheduler return path as discussed with Steven Rostedt
18847 + *
18848 + *  Copyright (C) 2006, Kihon Technologies Inc.
18849 + *    Steven Rostedt <rostedt@goodmis.org>
18850 + *  - debugged and patched Thomas Gleixner's rework.
18851 + *  - added back the cmpxchg to the rework.
18852 + *  - turned atomic require back on for SMP.
18853 + */
18855 +#include <linux/spinlock.h>
18856 +#include <linux/rtmutex.h>
18857 +#include <linux/sched.h>
18858 +#include <linux/delay.h>
18859 +#include <linux/module.h>
18860 +#include <linux/kallsyms.h>
18861 +#include <linux/syscalls.h>
18862 +#include <linux/interrupt.h>
18863 +#include <linux/plist.h>
18864 +#include <linux/fs.h>
18865 +#include <linux/futex.h>
18866 +#include <linux/hrtimer.h>
18868 +#include "rtmutex_common.h"
18871 + * struct mutex functions
18872 + */
18873 +void __mutex_do_init(struct mutex *mutex, const char *name,
18874 +                    struct lock_class_key *key)
18876 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18877 +       /*
18878 +        * Make sure we are not reinitializing a held lock:
18879 +        */
18880 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
18881 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
18882 +#endif
18883 +       mutex->lock.save_state = 0;
18885 +EXPORT_SYMBOL(__mutex_do_init);
18887 +void __lockfunc _mutex_lock(struct mutex *lock)
18889 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18890 +       rt_mutex_lock(&lock->lock);
18892 +EXPORT_SYMBOL(_mutex_lock);
18894 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
18896 +       int ret;
18898 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18899 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18900 +       if (ret)
18901 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18902 +       return ret;
18904 +EXPORT_SYMBOL(_mutex_lock_interruptible);
18906 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
18908 +       int ret;
18910 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
18911 +       ret = rt_mutex_lock_killable(&lock->lock);
18912 +       if (ret)
18913 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18914 +       return ret;
18916 +EXPORT_SYMBOL(_mutex_lock_killable);
18918 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
18919 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
18921 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18922 +       rt_mutex_lock(&lock->lock);
18924 +EXPORT_SYMBOL(_mutex_lock_nested);
18926 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
18928 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
18929 +       rt_mutex_lock(&lock->lock);
18931 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
18933 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
18935 +       int ret;
18937 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
18938 +       ret = rt_mutex_lock_interruptible(&lock->lock);
18939 +       if (ret)
18940 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18941 +       return ret;
18943 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
18945 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
18947 +       int ret;
18949 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
18950 +       ret = rt_mutex_lock_killable(&lock->lock);
18951 +       if (ret)
18952 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
18953 +       return ret;
18955 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
18956 +#endif
18958 +int __lockfunc _mutex_trylock(struct mutex *lock)
18960 +       int ret = rt_mutex_trylock(&lock->lock);
18962 +       if (ret)
18963 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
18965 +       return ret;
18967 +EXPORT_SYMBOL(_mutex_trylock);
18969 +void __lockfunc _mutex_unlock(struct mutex *lock)
18971 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
18972 +       rt_mutex_unlock(&lock->lock);
18974 +EXPORT_SYMBOL(_mutex_unlock);
18977 + * rwlock_t functions
18978 + */
18979 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
18981 +       int ret;
18983 +       migrate_disable();
18984 +       ret = rt_mutex_trylock(&rwlock->lock);
18985 +       if (ret)
18986 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
18987 +       else
18988 +               migrate_enable();
18990 +       return ret;
18992 +EXPORT_SYMBOL(rt_write_trylock);
18994 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
18996 +       int ret;
18998 +       *flags = 0;
18999 +       ret = rt_write_trylock(rwlock);
19000 +       return ret;
19002 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
19004 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
19006 +       struct rt_mutex *lock = &rwlock->lock;
19007 +       int ret = 1;
19009 +       /*
19010 +        * recursive read locks succeed when current owns the lock,
19011 +        * but not when read_depth == 0 which means that the lock is
19012 +        * write locked.
19013 +        */
19014 +       if (rt_mutex_owner(lock) != current) {
19015 +               migrate_disable();
19016 +               ret = rt_mutex_trylock(lock);
19017 +               if (ret)
19018 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
19019 +               else
19020 +                       migrate_enable();
19022 +       } else if (!rwlock->read_depth) {
19023 +               ret = 0;
19024 +       }
19026 +       if (ret)
19027 +               rwlock->read_depth++;
19029 +       return ret;
19031 +EXPORT_SYMBOL(rt_read_trylock);
19033 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
19035 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19036 +       __rt_spin_lock(&rwlock->lock);
19038 +EXPORT_SYMBOL(rt_write_lock);
19040 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
19042 +       struct rt_mutex *lock = &rwlock->lock;
19045 +       /*
19046 +        * recursive read locks succeed when current owns the lock
19047 +        */
19048 +       if (rt_mutex_owner(lock) != current) {
19049 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
19050 +               __rt_spin_lock(lock);
19051 +       }
19052 +       rwlock->read_depth++;
19055 +EXPORT_SYMBOL(rt_read_lock);
19057 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
19059 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19060 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19061 +       __rt_spin_unlock(&rwlock->lock);
19062 +       migrate_enable();
19064 +EXPORT_SYMBOL(rt_write_unlock);
19066 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
19068 +       /* Release the lock only when read_depth is down to 0 */
19069 +       if (--rwlock->read_depth == 0) {
19070 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
19071 +               __rt_spin_unlock(&rwlock->lock);
19072 +               migrate_enable();
19073 +       }
19075 +EXPORT_SYMBOL(rt_read_unlock);
19077 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
19079 +       rt_write_lock(rwlock);
19081 +       return 0;
19083 +EXPORT_SYMBOL(rt_write_lock_irqsave);
19085 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
19087 +       rt_read_lock(rwlock);
19089 +       return 0;
19091 +EXPORT_SYMBOL(rt_read_lock_irqsave);
19093 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
19095 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19096 +       /*
19097 +        * Make sure we are not reinitializing a held lock:
19098 +        */
19099 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
19100 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
19101 +#endif
19102 +       rwlock->lock.save_state = 1;
19103 +       rwlock->read_depth = 0;
19105 +EXPORT_SYMBOL(__rt_rwlock_init);
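rt_read_lock()/rt_read_unlock() above implement reader recursion on top of a single sleeping lock: only the first acquisition by a task takes the underlying rt_mutex, and read_depth counts how many times that owner has re-entered. The following is a pthread/C11 sketch of the same owner-plus-depth scheme; the names are invented, and note that in the RT variant concurrent readers are serialized, unlike a conventional rwlock.

#include <pthread.h>
#include <stdatomic.h>

struct rt_rwlock_demo {
	pthread_mutex_t lock;
	_Atomic(void *) owner;		/* identity of the current holder */
	int             read_depth;	/* only touched by the holder */
};

static _Thread_local char demo_self;	/* unique address per thread */

static void demo_read_lock(struct rt_rwlock_demo *rw)
{
	/* recursive read: if we already hold the lock, just go deeper */
	if (atomic_load(&rw->owner) != (void *)&demo_self) {
		pthread_mutex_lock(&rw->lock);
		atomic_store(&rw->owner, &demo_self);
	}
	rw->read_depth++;
}

static void demo_read_unlock(struct rt_rwlock_demo *rw)
{
	/* release the underlying lock only when depth drops back to zero */
	if (--rw->read_depth == 0) {
		atomic_store(&rw->owner, NULL);
		pthread_mutex_unlock(&rw->lock);
	}
}

int main(void)
{
	static struct rt_rwlock_demo rw = { .lock = PTHREAD_MUTEX_INITIALIZER };

	demo_read_lock(&rw);
	demo_read_lock(&rw);	/* nested read does not self-deadlock */
	demo_read_unlock(&rw);
	demo_read_unlock(&rw);
	return 0;
}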
19108 + * rw_semaphores
19109 + */
19111 +void  rt_up_write(struct rw_semaphore *rwsem)
19113 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
19114 +       rt_mutex_unlock(&rwsem->lock);
19116 +EXPORT_SYMBOL(rt_up_write);
19118 +void __rt_up_read(struct rw_semaphore *rwsem)
19120 +       if (--rwsem->read_depth == 0)
19121 +               rt_mutex_unlock(&rwsem->lock);
19124 +void  rt_up_read(struct rw_semaphore *rwsem)
19126 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
19127 +       __rt_up_read(rwsem);
19129 +EXPORT_SYMBOL(rt_up_read);
19132 + * downgrade a write lock into a read lock
19133 + * - just wake up any readers at the front of the queue
19134 + */
19135 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
19137 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
19138 +       rwsem->read_depth = 1;
19140 +EXPORT_SYMBOL(rt_downgrade_write);
19142 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
19144 +       int ret = rt_mutex_trylock(&rwsem->lock);
19146 +       if (ret)
19147 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
19148 +       return ret;
19150 +EXPORT_SYMBOL(rt_down_write_trylock);
19152 +void  rt_down_write(struct rw_semaphore *rwsem)
19154 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
19155 +       rt_mutex_lock(&rwsem->lock);
19157 +EXPORT_SYMBOL(rt_down_write);
19159 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
19161 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
19162 +       rt_mutex_lock(&rwsem->lock);
19164 +EXPORT_SYMBOL(rt_down_write_nested);
19166 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
19167 +                              struct lockdep_map *nest)
19169 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
19170 +       rt_mutex_lock(&rwsem->lock);
19172 +EXPORT_SYMBOL(rt_down_write_nested_lock);
19174 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
19176 +       struct rt_mutex *lock = &rwsem->lock;
19177 +       int ret = 1;
19179 +       /*
19180 +        * recursive read locks succeed when current owns the rwsem,
19181 +        * but not when read_depth == 0 which means that the rwsem is
19182 +        * write locked.
19183 +        */
19184 +       if (rt_mutex_owner(lock) != current)
19185 +               ret = rt_mutex_trylock(&rwsem->lock);
19186 +       else if (!rwsem->read_depth)
19187 +               ret = 0;
19189 +       if (ret)
19190 +               rwsem->read_depth++;
19191 +       return ret;
19195 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
19197 +       int ret;
19199 +       ret = rt__down_read_trylock(rwsem);
19200 +       if (ret)
19201 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
19203 +       return ret;
19205 +EXPORT_SYMBOL(rt_down_read_trylock);
19207 +void rt__down_read(struct rw_semaphore *rwsem)
19209 +       struct rt_mutex *lock = &rwsem->lock;
19211 +       if (rt_mutex_owner(lock) != current)
19212 +               rt_mutex_lock(&rwsem->lock);
19213 +       rwsem->read_depth++;
19215 +EXPORT_SYMBOL(rt__down_read);
19217 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
19219 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
19220 +       rt__down_read(rwsem);
19223 +void  rt_down_read(struct rw_semaphore *rwsem)
19225 +       __rt_down_read(rwsem, 0);
19227 +EXPORT_SYMBOL(rt_down_read);
19229 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
19231 +       __rt_down_read(rwsem, subclass);
19233 +EXPORT_SYMBOL(rt_down_read_nested);
19235 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
19236 +                             struct lock_class_key *key)
19238 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19239 +       /*
19240 +        * Make sure we are not reinitializing a held lock:
19241 +        */
19242 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
19243 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
19244 +#endif
19245 +       rwsem->read_depth = 0;
19246 +       rwsem->lock.save_state = 0;
19248 +EXPORT_SYMBOL(__rt_rwsem_init);
19250 +/**
19251 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
19252 + * @cnt: the atomic which we are to dec
19253 + * @lock: the mutex to return holding if we dec to 0
19254 + *
19255 + * return true and hold lock if we dec to 0, return false otherwise
19256 + */
19257 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
19259 +       /* dec if we can't possibly hit 0 */
19260 +       if (atomic_add_unless(cnt, -1, 1))
19261 +               return 0;
19262 +       /* we might hit 0, so take the lock */
19263 +       mutex_lock(lock);
19264 +       if (!atomic_dec_and_test(cnt)) {
19265 +               /* when we actually did the dec, we didn't hit 0 */
19266 +               mutex_unlock(lock);
19267 +               return 0;
19268 +       }
19269 +       /* we hit 0, and we hold the lock */
19270 +       return 1;
19272 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
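atomic_dec_and_mutex_lock() above is the classic dec-and-lock pattern: decrement lock-free while the counter cannot possibly reach zero, and only take the mutex to serialize the final 1 -> 0 transition. A self-contained C11/pthreads sketch of the same pattern follows; dec_unless_one and dec_and_lock_demo are invented names.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* add -1 to *v unless it is exactly 1; return true if we did decrement */
static bool dec_unless_one(atomic_int *v)
{
	int c = atomic_load(v);

	while (c != 1) {
		if (atomic_compare_exchange_weak(v, &c, c - 1))
			return true;
	}
	return false;
}

/* return true holding *lock iff this call dropped the count to zero */
static bool dec_and_lock_demo(atomic_int *cnt, pthread_mutex_t *lock)
{
	if (dec_unless_one(cnt))		/* fast path: cannot hit zero */
		return false;

	pthread_mutex_lock(lock);		/* slow path: serialize the last drop */
	if (atomic_fetch_sub(cnt, 1) == 1)
		return true;			/* hit zero, lock stays held */
	pthread_mutex_unlock(lock);
	return false;
}

int main(void)
{
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	atomic_int refs = 2;

	dec_and_lock_demo(&refs, &lock);	/* 2 -> 1: no lock taken */
	if (dec_and_lock_demo(&refs, &lock))	/* 1 -> 0: returns with lock held */
		pthread_mutex_unlock(&lock);
	return 0;
}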
19273 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
19274 index b066724d7a5b..0e9a6260441d 100644
19275 --- a/kernel/locking/rtmutex.c
19276 +++ b/kernel/locking/rtmutex.c
19277 @@ -7,6 +7,11 @@
19278   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19279   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
19280   *  Copyright (C) 2006 Esben Nielsen
19281 + *  Adaptive Spinlocks:
19282 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
19283 + *                                  and Peter Morreale,
19284 + * Adaptive Spinlocks simplification:
19285 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
19286   *
19287   *  See Documentation/locking/rt-mutex-design.txt for details.
19288   */
19289 @@ -16,6 +21,7 @@
19290  #include <linux/sched/rt.h>
19291  #include <linux/sched/deadline.h>
19292  #include <linux/timer.h>
19293 +#include <linux/ww_mutex.h>
19295  #include "rtmutex_common.h"
19297 @@ -133,6 +139,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
19298                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
19301 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
19303 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
19304 +               waiter != PI_REQUEUE_INPROGRESS;
19307  /*
19308   * We can speed up the acquire/release, if there's no debugging state to be
19309   * set up.
19310 @@ -163,13 +175,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
19311   * 2) Drop lock->wait_lock
19312   * 3) Try to unlock the lock with cmpxchg
19313   */
19314 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
19315 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
19316 +                                       unsigned long flags)
19317         __releases(lock->wait_lock)
19319         struct task_struct *owner = rt_mutex_owner(lock);
19321         clear_rt_mutex_waiters(lock);
19322 -       raw_spin_unlock(&lock->wait_lock);
19323 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19324         /*
19325          * If a new waiter comes in between the unlock and the cmpxchg
19326          * we have two situations:
19327 @@ -211,11 +224,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
19328  /*
19329   * Simple slow path only version: lock->owner is protected by lock->wait_lock.
19330   */
19331 -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
19332 +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
19333 +                                       unsigned long flags)
19334         __releases(lock->wait_lock)
19336         lock->owner = NULL;
19337 -       raw_spin_unlock(&lock->wait_lock);
19338 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19339         return true;
19341  #endif
19342 @@ -412,6 +426,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
19343         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
19346 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
19348 +       if (waiter->savestate)
19349 +               wake_up_lock_sleeper(waiter->task);
19350 +       else
19351 +               wake_up_process(waiter->task);
19354  /*
19355   * Max number of times we'll walk the boosting chain:
19356   */
19357 @@ -419,7 +441,8 @@ int max_lock_depth = 1024;
19359  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
19361 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
19362 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
19363 +               p->pi_blocked_on->lock : NULL;
19366  /*
19367 @@ -497,7 +520,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19368         int ret = 0, depth = 0;
19369         struct rt_mutex *lock;
19370         bool detect_deadlock;
19371 -       unsigned long flags;
19372         bool requeue = true;
19374         detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
19375 @@ -540,7 +562,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19376         /*
19377          * [1] Task cannot go away as we did a get_task() before !
19378          */
19379 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19380 +       raw_spin_lock_irq(&task->pi_lock);
19382         /*
19383          * [2] Get the waiter on which @task is blocked on.
19384 @@ -556,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19385          * reached or the state of the chain has changed while we
19386          * dropped the locks.
19387          */
19388 -       if (!waiter)
19389 +       if (!rt_mutex_real_waiter(waiter))
19390                 goto out_unlock_pi;
19392         /*
19393 @@ -624,7 +646,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19394          * operations.
19395          */
19396         if (!raw_spin_trylock(&lock->wait_lock)) {
19397 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19398 +               raw_spin_unlock_irq(&task->pi_lock);
19399                 cpu_relax();
19400                 goto retry;
19401         }
19402 @@ -655,7 +677,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19403                 /*
19404                  * No requeue[7] here. Just release @task [8]
19405                  */
19406 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19407 +               raw_spin_unlock(&task->pi_lock);
19408                 put_task_struct(task);
19410                 /*
19411 @@ -663,14 +685,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19412                  * If there is no owner of the lock, end of chain.
19413                  */
19414                 if (!rt_mutex_owner(lock)) {
19415 -                       raw_spin_unlock(&lock->wait_lock);
19416 +                       raw_spin_unlock_irq(&lock->wait_lock);
19417                         return 0;
19418                 }
19420                 /* [10] Grab the next task, i.e. owner of @lock */
19421                 task = rt_mutex_owner(lock);
19422                 get_task_struct(task);
19423 -               raw_spin_lock_irqsave(&task->pi_lock, flags);
19424 +               raw_spin_lock(&task->pi_lock);
19426                 /*
19427                  * No requeue [11] here. We just do deadlock detection.
19428 @@ -685,8 +707,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19429                 top_waiter = rt_mutex_top_waiter(lock);
19431                 /* [13] Drop locks */
19432 -               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19433 -               raw_spin_unlock(&lock->wait_lock);
19434 +               raw_spin_unlock(&task->pi_lock);
19435 +               raw_spin_unlock_irq(&lock->wait_lock);
19437                 /* If owner is not blocked, end of chain. */
19438                 if (!next_lock)
19439 @@ -707,7 +729,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19440         rt_mutex_enqueue(lock, waiter);
19442         /* [8] Release the task */
19443 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19444 +       raw_spin_unlock(&task->pi_lock);
19445         put_task_struct(task);
19447         /*
19448 @@ -718,21 +740,24 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19449          * follow here. This is the end of the chain we are walking.
19450          */
19451         if (!rt_mutex_owner(lock)) {
19452 +               struct rt_mutex_waiter *lock_top_waiter;
19454                 /*
19455                  * If the requeue [7] above changed the top waiter,
19456                  * then we need to wake the new top waiter up to try
19457                  * to get the lock.
19458                  */
19459 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
19460 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
19461 -               raw_spin_unlock(&lock->wait_lock);
19462 +               lock_top_waiter = rt_mutex_top_waiter(lock);
19463 +               if (prerequeue_top_waiter != lock_top_waiter)
19464 +                       rt_mutex_wake_waiter(lock_top_waiter);
19465 +               raw_spin_unlock_irq(&lock->wait_lock);
19466                 return 0;
19467         }
19469         /* [10] Grab the next task, i.e. the owner of @lock */
19470         task = rt_mutex_owner(lock);
19471         get_task_struct(task);
19472 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19473 +       raw_spin_lock(&task->pi_lock);
19475         /* [11] requeue the pi waiters if necessary */
19476         if (waiter == rt_mutex_top_waiter(lock)) {
19477 @@ -786,8 +811,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19478         top_waiter = rt_mutex_top_waiter(lock);
19480         /* [13] Drop the locks */
19481 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19482 -       raw_spin_unlock(&lock->wait_lock);
19483 +       raw_spin_unlock(&task->pi_lock);
19484 +       raw_spin_unlock_irq(&lock->wait_lock);
19486         /*
19487          * Make the actual exit decisions [12], based on the stored
19488 @@ -810,28 +835,46 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
19489         goto again;
19491   out_unlock_pi:
19492 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19493 +       raw_spin_unlock_irq(&task->pi_lock);
19494   out_put_task:
19495         put_task_struct(task);
19497         return ret;
19501 +#define STEAL_NORMAL  0
19502 +#define STEAL_LATERAL 1
19505 + * Note that RT tasks are excluded from lateral-steals to prevent the
19506 + * introduction of an unbounded latency
19507 + */
19508 +static inline int lock_is_stealable(struct task_struct *task,
19509 +                                   struct task_struct *pendowner, int mode)
19511 +    if (mode == STEAL_NORMAL || rt_task(task)) {
19512 +           if (task->prio >= pendowner->prio)
19513 +                   return 0;
19514 +    } else if (task->prio > pendowner->prio)
19515 +           return 0;
19516 +    return 1;
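lock_is_stealable() above encodes the steal policy in terms of kernel priority numbers, where a lower ->prio value means a more important task: a normal steal needs strictly higher priority, a lateral steal also allows equal priority, and RT tasks are always held to the strict rule so they cannot inflict unbounded latency on each other. A standalone restatement of that decision; can_steal and the *_DEMO constants are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

enum { STEAL_NORMAL_DEMO, STEAL_LATERAL_DEMO };

static bool can_steal(int task_prio, bool task_is_rt,
		      int pendowner_prio, int mode)
{
	if (mode == STEAL_NORMAL_DEMO || task_is_rt)
		return task_prio < pendowner_prio;	/* strictly more important */
	return task_prio <= pendowner_prio;		/* lateral: ties allowed too */
}

int main(void)
{
	/* equal priority: only the non-RT lateral steal succeeds */
	printf("%d %d\n",
	       can_steal(50, false, 50, STEAL_LATERAL_DEMO),
	       can_steal(50, true,  50, STEAL_LATERAL_DEMO));
	return 0;
}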
19519  /*
19520   * Try to take an rt-mutex
19521   *
19522 - * Must be called with lock->wait_lock held.
19523 + * Must be called with lock->wait_lock held and interrupts disabled
19524   *
19525   * @lock:   The lock to be acquired.
19526   * @task:   The task which wants to acquire the lock
19527   * @waiter: The waiter that is queued to the lock's wait tree if the
19528   *         callsite called task_blocked_on_lock(), otherwise NULL
19529   */
19530 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19531 -                               struct rt_mutex_waiter *waiter)
19532 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
19533 +                                 struct task_struct *task,
19534 +                                 struct rt_mutex_waiter *waiter, int mode)
19536 -       unsigned long flags;
19538         /*
19539          * Before testing whether we can acquire @lock, we set the
19540          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
19541 @@ -867,8 +910,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19542                  * If waiter is not the highest priority waiter of
19543                  * @lock, give up.
19544                  */
19545 -               if (waiter != rt_mutex_top_waiter(lock))
19546 +               if (waiter != rt_mutex_top_waiter(lock)) {
19547 +                       /* XXX lock_is_stealable() ? */
19548                         return 0;
19549 +               }
19551                 /*
19552                  * We can acquire the lock. Remove the waiter from the
19553 @@ -886,14 +931,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19554                  * not need to be dequeued.
19555                  */
19556                 if (rt_mutex_has_waiters(lock)) {
19557 -                       /*
19558 -                        * If @task->prio is greater than or equal to
19559 -                        * the top waiter priority (kernel view),
19560 -                        * @task lost.
19561 -                        */
19562 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
19563 -                               return 0;
19564 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
19566 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
19567 +                               return 0;
19568                         /*
19569                          * The current top waiter stays enqueued. We
19570                          * don't have to change anything in the lock
19571 @@ -916,7 +957,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19572          * case, but conditionals are more expensive than a redundant
19573          * store.
19574          */
19575 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
19576 +       raw_spin_lock(&task->pi_lock);
19577         task->pi_blocked_on = NULL;
19578         /*
19579          * Finish the lock acquisition. @task is the new owner. If
19580 @@ -925,7 +966,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
19581          */
19582         if (rt_mutex_has_waiters(lock))
19583                 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
19584 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
19585 +       raw_spin_unlock(&task->pi_lock);
19587  takeit:
19588         /* We got the lock. */
19589 @@ -942,12 +983,444 @@ takeit:
19590         return 1;
19593 +#ifdef CONFIG_PREEMPT_RT_FULL
19595 + * preemptible spin_lock functions:
19596 + */
19597 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
19598 +                                        void  (*slowfn)(struct rt_mutex *lock,
19599 +                                                        bool mg_off),
19600 +                                        bool do_mig_dis)
19602 +       might_sleep_no_state_check();
19604 +       if (do_mig_dis)
19605 +               migrate_disable();
19607 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
19608 +               rt_mutex_deadlock_account_lock(lock, current);
19609 +       else
19610 +               slowfn(lock, do_mig_dis);
19613 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
19614 +                                          int  (*slowfn)(struct rt_mutex *lock))
19616 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
19617 +               rt_mutex_deadlock_account_unlock(current);
19618 +               return 0;
19619 +       }
19620 +       return slowfn(lock);
19622 +#ifdef CONFIG_SMP
19624 + * Note that owner is a speculative pointer and dereferencing relies
19625 + * on rcu_read_lock() and the check against the lock owner.
19626 + */
19627 +static int adaptive_wait(struct rt_mutex *lock,
19628 +                        struct task_struct *owner)
19630 +       int res = 0;
19632 +       rcu_read_lock();
19633 +       for (;;) {
19634 +               if (owner != rt_mutex_owner(lock))
19635 +                       break;
19636 +               /*
19637 +                * Ensure that owner->on_cpu is dereferenced _after_
19638 +                * checking the above to be valid.
19639 +                */
19640 +               barrier();
19641 +               if (!owner->on_cpu) {
19642 +                       res = 1;
19643 +                       break;
19644 +               }
19645 +               cpu_relax();
19646 +       }
19647 +       rcu_read_unlock();
19648 +       return res;
19650 +#else
19651 +static int adaptive_wait(struct rt_mutex *lock,
19652 +                        struct task_struct *orig_owner)
19654 +       return 1;
19656 +#endif
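adaptive_wait() above captures the adaptive-spin policy: spinning pays off only while the very same owner still holds the lock and is currently on a CPU; once the owner is preempted, or the lock changes hands, the waiter should block. The kernel version holds rcu_read_lock() so the speculative owner pointer stays dereferenceable, which the C11 sketch below glosses over; the names and the atomic flag standing in for owner->on_cpu are invented.

#include <stdatomic.h>
#include <stdbool.h>

struct demo_owner {
	_Atomic bool on_cpu;	/* is this owner currently running? */
};

/* returns true when the caller should stop spinning and go to sleep */
bool adaptive_wait_demo(_Atomic(struct demo_owner *) *lock_owner,
			struct demo_owner *owner)
{
	for (;;) {
		if (atomic_load(lock_owner) != owner)
			return false;	/* lock changed hands: retry the acquisition */
		if (!atomic_load(&owner->on_cpu))
			return true;	/* owner is off CPU: block instead of spinning */
		/* the kernel would cpu_relax() here before re-checking */
	}
}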
19658 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
19659 +                                  struct rt_mutex_waiter *waiter,
19660 +                                  struct task_struct *task,
19661 +                                  enum rtmutex_chainwalk chwalk);
19663 + * Slow path lock function spin_lock style: this variant is very
19664 + * careful not to miss any non-lock wakeups.
19665 + *
19666 + * We store the current state under p->pi_lock in p->saved_state and
19667 + * the try_to_wake_up() code handles this accordingly.
19668 + */
19669 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
19670 +                                                   bool mg_off)
19672 +       struct task_struct *lock_owner, *self = current;
19673 +       struct rt_mutex_waiter waiter, *top_waiter;
19674 +       unsigned long flags;
19675 +       int ret;
19677 +       rt_mutex_init_waiter(&waiter, true);
19679 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19681 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
19682 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19683 +               return;
19684 +       }
19686 +       BUG_ON(rt_mutex_owner(lock) == self);
19688 +       /*
19689 +        * We save whatever state the task is in and we'll restore it
19690 +        * after acquiring the lock taking real wakeups into account
19691 +        * as well. We are serialized via pi_lock against wakeups. See
19692 +        * try_to_wake_up().
19693 +        */
19694 +       raw_spin_lock(&self->pi_lock);
19695 +       self->saved_state = self->state;
19696 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19697 +       raw_spin_unlock(&self->pi_lock);
19699 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
19700 +       BUG_ON(ret);
19702 +       for (;;) {
19703 +               /* Try to acquire the lock again. */
19704 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
19705 +                       break;
19707 +               top_waiter = rt_mutex_top_waiter(lock);
19708 +               lock_owner = rt_mutex_owner(lock);
19710 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19712 +               debug_rt_mutex_print_deadlock(&waiter);
19714 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
19715 +                       if (mg_off)
19716 +                               migrate_enable();
19717 +                       schedule();
19718 +                       if (mg_off)
19719 +                               migrate_disable();
19720 +               }
19722 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
19724 +               raw_spin_lock(&self->pi_lock);
19725 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
19726 +               raw_spin_unlock(&self->pi_lock);
19727 +       }
19729 +       /*
19730 +        * Restore the task state to current->saved_state. We set it
19731 +        * to the original state above and the try_to_wake_up() code
19732 +        * has possibly updated it when a real (non-rtmutex) wakeup
19733 +        * happened while we were blocked. Clear saved_state so
19734 +        * try_to_wakeup() does not get confused.
19735 +        * try_to_wake_up() does not get confused.
19736 +       raw_spin_lock(&self->pi_lock);
19737 +       __set_current_state_no_track(self->saved_state);
19738 +       self->saved_state = TASK_RUNNING;
19739 +       raw_spin_unlock(&self->pi_lock);
19741 +       /*
19742 +        * try_to_take_rt_mutex() sets the waiter bit
19743 +        * unconditionally. We might have to fix that up:
19744 +        */
19745 +       fixup_rt_mutex_waiters(lock);
19747 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
19748 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
19750 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19752 +       debug_rt_mutex_free_waiter(&waiter);
19755 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
19756 +                                   struct wake_q_head *wake_sleeper_q,
19757 +                                   struct rt_mutex *lock);
19759 + * Slow path to release a rt_mutex spin_lock style
19760 + */
19761 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
19763 +       unsigned long flags;
19764 +       WAKE_Q(wake_q);
19765 +       WAKE_Q(wake_sleeper_q);
19767 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19769 +       debug_rt_mutex_unlock(lock);
19771 +       rt_mutex_deadlock_account_unlock(current);
19773 +       if (!rt_mutex_has_waiters(lock)) {
19774 +               lock->owner = NULL;
19775 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19776 +               return 0;
19777 +       }
19779 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19781 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19782 +       wake_up_q(&wake_q);
19783 +       wake_up_q_sleeper(&wake_sleeper_q);
19785 +       /* Undo pi boosting when necessary */
19786 +       rt_mutex_adjust_prio(current);
19787 +       return 0;
19790 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
19792 +       unsigned long flags;
19793 +       WAKE_Q(wake_q);
19794 +       WAKE_Q(wake_sleeper_q);
19796 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
19798 +       debug_rt_mutex_unlock(lock);
19800 +       rt_mutex_deadlock_account_unlock(current);
19802 +       if (!rt_mutex_has_waiters(lock)) {
19803 +               lock->owner = NULL;
19804 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19805 +               return 0;
19806 +       }
19808 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
19810 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
19811 +       wake_up_q(&wake_q);
19812 +       wake_up_q_sleeper(&wake_sleeper_q);
19813 +       return 1;
19816 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
19818 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
19819 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19821 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
19823 +void __lockfunc rt_spin_lock(spinlock_t *lock)
19825 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19826 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
19828 +EXPORT_SYMBOL(rt_spin_lock);
19830 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
19832 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
19834 +EXPORT_SYMBOL(__rt_spin_lock);
19836 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
19838 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
19840 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
19842 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19843 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
19845 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
19846 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
19848 +EXPORT_SYMBOL(rt_spin_lock_nested);
19849 +#endif
19851 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
19853 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19854 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19855 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19857 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
19859 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
19861 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19862 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19863 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
19864 +       migrate_enable();
19866 +EXPORT_SYMBOL(rt_spin_unlock);
19868 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
19870 +       int ret;
19872 +       /* NOTE: we always pass in '1' for nested, for simplicity */
19873 +       spin_release(&lock->dep_map, 1, _RET_IP_);
19874 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
19875 +       migrate_enable();
19876 +       return ret;
19879 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
19881 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
19883 +EXPORT_SYMBOL(__rt_spin_unlock);
19886 + * Wait for the lock to get unlocked: instead of polling for an unlock
19887 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
19888 + * schedule if there's contention:
19889 + */
19890 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
19892 +       spin_lock(lock);
19893 +       spin_unlock(lock);
19895 +EXPORT_SYMBOL(rt_spin_unlock_wait);
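rt_spin_unlock_wait() above cannot poll a sleeping lock the way a raw spinlock is polled, so it simply acquires and releases the lock, which both waits out the current holder and lets the scheduler run under contention. The same trick expressed with a pthread mutex; unlock_wait_demo is an invented name.

#include <pthread.h>

void unlock_wait_demo(pthread_mutex_t *m)
{
	pthread_mutex_lock(m);		/* blocks until the current holder is done */
	pthread_mutex_unlock(m);	/* immediately hand the lock back */
}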
19897 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
19899 +       return rt_mutex_trylock(lock);
19902 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
19904 +       int ret;
19906 +       ret = rt_mutex_trylock(&lock->lock);
19907 +       if (ret)
19908 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19909 +       return ret;
19911 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
19913 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
19915 +       int ret;
19917 +       migrate_disable();
19918 +       ret = rt_mutex_trylock(&lock->lock);
19919 +       if (ret)
19920 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19921 +       else
19922 +               migrate_enable();
19923 +       return ret;
19925 +EXPORT_SYMBOL(rt_spin_trylock);
19927 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
19929 +       int ret;
19931 +       local_bh_disable();
19932 +       ret = rt_mutex_trylock(&lock->lock);
19933 +       if (ret) {
19934 +               migrate_disable();
19935 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19936 +       } else
19937 +               local_bh_enable();
19938 +       return ret;
19940 +EXPORT_SYMBOL(rt_spin_trylock_bh);
19942 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
19944 +       int ret;
19946 +       *flags = 0;
19947 +       ret = rt_mutex_trylock(&lock->lock);
19948 +       if (ret) {
19949 +               migrate_disable();
19950 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
19951 +       }
19952 +       return ret;
19954 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
19956 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
19958 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
19959 +       if (atomic_add_unless(atomic, -1, 1))
19960 +               return 0;
19961 +       rt_spin_lock(lock);
19962 +       if (atomic_dec_and_test(atomic))
19963 +               return 1;
19964 +       rt_spin_unlock(lock);
19965 +       return 0;
19967 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
19969 +       void
19970 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
19972 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
19973 +       /*
19974 +        * Make sure we are not reinitializing a held lock:
19975 +        */
19976 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
19977 +       lockdep_init_map(&lock->dep_map, name, key, 0);
19978 +#endif
19980 +EXPORT_SYMBOL(__rt_spin_lock_init);
19982 +#endif /* PREEMPT_RT_FULL */
19984 +#ifdef CONFIG_PREEMPT_RT_FULL
19985 +       static inline int __sched
19986 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
19988 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
19989 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
19991 +       if (!hold_ctx)
19992 +               return 0;
19994 +       if (unlikely(ctx == hold_ctx))
19995 +               return -EALREADY;
19997 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
19998 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
19999 +#ifdef CONFIG_DEBUG_MUTEXES
20000 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
20001 +               ctx->contending_lock = ww;
20002 +#endif
20003 +               return -EDEADLK;
20004 +       }
20006 +       return 0;
20008 +#else
20009 +       static inline int __sched
20010 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
20012 +       BUG();
20013 +       return 0;
20016 +#endif
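__mutex_lock_check_stamp() above applies the wound/wait rule for ww_mutexes: acquire contexts carry a monotonically increasing stamp, the same context gets -EALREADY, and a younger context that finds the mutex held by an older one backs off with -EDEADLK (equal stamps are tie-broken by comparing context pointers, which the sketch omits). The subtraction against LONG_MAX is the usual wraparound-safe ordering test, shown in isolation below with an invented name.

#include <limits.h>
#include <stdbool.h>

/*
 * True if the context stamped 'a' was created after the one stamped 'b',
 * i.e. 'a' is the younger context that must back off. The unsigned
 * subtraction keeps the comparison correct across counter wraparound.
 */
bool younger_must_back_off(unsigned long a, unsigned long b)
{
	return a != b && a - b <= LONG_MAX;
}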
20018 +static inline int
20019 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
20020 +                    struct rt_mutex_waiter *waiter)
20022 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
20025  /*
20026   * Task blocks on lock.
20027   *
20028   * Prepare waiter and propagate pi chain
20029   *
20030 - * This must be called with lock->wait_lock held.
20031 + * This must be called with lock->wait_lock held and interrupts disabled
20032   */
20033  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20034                                    struct rt_mutex_waiter *waiter,
20035 @@ -958,7 +1431,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20036         struct rt_mutex_waiter *top_waiter = waiter;
20037         struct rt_mutex *next_lock;
20038         int chain_walk = 0, res;
20039 -       unsigned long flags;
20041         /*
20042          * Early deadlock detection. We really don't want the task to
20043 @@ -972,7 +1444,24 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20044         if (owner == task)
20045                 return -EDEADLK;
20047 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
20048 +       raw_spin_lock(&task->pi_lock);
20050 +       /*
20051 +        * In the case of futex requeue PI, this will be a proxy
20052 +        * lock. The task will wake unaware that it is enqueued on
20053 +        * this lock. Avoid blocking on two locks and corrupting
20054 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
20055 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
20056 +        * before requeue (due to a signal or timeout). Do not enqueue
20057 +        * the task if PI_WAKEUP_INPROGRESS is set.
20058 +        */
20059 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
20060 +               raw_spin_unlock(&task->pi_lock);
20061 +               return -EAGAIN;
20062 +       }
20064 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
20066         __rt_mutex_adjust_prio(task);
20067         waiter->task = task;
20068         waiter->lock = lock;
20069 @@ -985,18 +1474,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20071         task->pi_blocked_on = waiter;
20073 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20074 +       raw_spin_unlock(&task->pi_lock);
20076         if (!owner)
20077                 return 0;
20079 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
20080 +       raw_spin_lock(&owner->pi_lock);
20081         if (waiter == rt_mutex_top_waiter(lock)) {
20082                 rt_mutex_dequeue_pi(owner, top_waiter);
20083                 rt_mutex_enqueue_pi(owner, waiter);
20085                 __rt_mutex_adjust_prio(owner);
20086 -               if (owner->pi_blocked_on)
20087 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
20088                         chain_walk = 1;
20089         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
20090                 chain_walk = 1;
20091 @@ -1005,7 +1494,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20092         /* Store the lock on which owner is blocked or NULL */
20093         next_lock = task_blocked_on_lock(owner);
20095 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
20096 +       raw_spin_unlock(&owner->pi_lock);
20097         /*
20098          * Even if full deadlock detection is on, if the owner is not
20099          * blocked itself, we can avoid finding this out in the chain
20100 @@ -1021,12 +1510,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20101          */
20102         get_task_struct(owner);
20104 -       raw_spin_unlock(&lock->wait_lock);
20105 +       raw_spin_unlock_irq(&lock->wait_lock);
20107         res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
20108                                          next_lock, waiter, task);
20110 -       raw_spin_lock(&lock->wait_lock);
20111 +       raw_spin_lock_irq(&lock->wait_lock);
20113         return res;
20115 @@ -1035,15 +1524,15 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
20116   * Remove the top waiter from the current tasks pi waiter tree and
20117   * queue it up.
20118   *
20119 - * Called with lock->wait_lock held.
20120 + * Called with lock->wait_lock held and interrupts disabled.
20121   */
20122  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
20123 +                                   struct wake_q_head *wake_sleeper_q,
20124                                     struct rt_mutex *lock)
20126         struct rt_mutex_waiter *waiter;
20127 -       unsigned long flags;
20129 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
20130 +       raw_spin_lock(&current->pi_lock);
20132         waiter = rt_mutex_top_waiter(lock);
20134 @@ -1065,15 +1554,18 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
20135          */
20136         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
20138 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
20139 +       raw_spin_unlock(&current->pi_lock);
20141 -       wake_q_add(wake_q, waiter->task);
20142 +       if (waiter->savestate)
20143 +               wake_q_add(wake_sleeper_q, waiter->task);
20144 +       else
20145 +               wake_q_add(wake_q, waiter->task);
20148  /*
20149   * Remove a waiter from a lock and give up
20150   *
20151 - * Must be called with lock->wait_lock held and
20152 + * Must be called with lock->wait_lock held and interrupts disabled. We must
20153   * have just failed to try_to_take_rt_mutex().
20154   */
20155  static void remove_waiter(struct rt_mutex *lock,
20156 @@ -1081,13 +1573,12 @@ static void remove_waiter(struct rt_mutex *lock,
20158         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
20159         struct task_struct *owner = rt_mutex_owner(lock);
20160 -       struct rt_mutex *next_lock;
20161 -       unsigned long flags;
20162 +       struct rt_mutex *next_lock = NULL;
20164 -       raw_spin_lock_irqsave(&current->pi_lock, flags);
20165 +       raw_spin_lock(&current->pi_lock);
20166         rt_mutex_dequeue(lock, waiter);
20167         current->pi_blocked_on = NULL;
20168 -       raw_spin_unlock_irqrestore(&current->pi_lock, flags);
20169 +       raw_spin_unlock(&current->pi_lock);
20171         /*
20172          * Only update priority if the waiter was the highest priority
20173 @@ -1096,7 +1587,7 @@ static void remove_waiter(struct rt_mutex *lock,
20174         if (!owner || !is_top_waiter)
20175                 return;
20177 -       raw_spin_lock_irqsave(&owner->pi_lock, flags);
20178 +       raw_spin_lock(&owner->pi_lock);
20180         rt_mutex_dequeue_pi(owner, waiter);
20182 @@ -1106,9 +1597,10 @@ static void remove_waiter(struct rt_mutex *lock,
20183         __rt_mutex_adjust_prio(owner);
20185         /* Store the lock on which owner is blocked or NULL */
20186 -       next_lock = task_blocked_on_lock(owner);
20187 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
20188 +               next_lock = task_blocked_on_lock(owner);
20190 -       raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
20191 +       raw_spin_unlock(&owner->pi_lock);
20193         /*
20194          * Don't walk the chain, if the owner task is not blocked
20195 @@ -1120,12 +1612,12 @@ static void remove_waiter(struct rt_mutex *lock,
20196         /* gets dropped in rt_mutex_adjust_prio_chain()! */
20197         get_task_struct(owner);
20199 -       raw_spin_unlock(&lock->wait_lock);
20200 +       raw_spin_unlock_irq(&lock->wait_lock);
20202         rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
20203                                    next_lock, NULL, current);
20205 -       raw_spin_lock(&lock->wait_lock);
20206 +       raw_spin_lock_irq(&lock->wait_lock);
20209  /*
20210 @@ -1142,17 +1634,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
20211         raw_spin_lock_irqsave(&task->pi_lock, flags);
20213         waiter = task->pi_blocked_on;
20214 -       if (!waiter || (waiter->prio == task->prio &&
20215 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
20216                         !dl_prio(task->prio))) {
20217                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20218                 return;
20219         }
20220         next_lock = waiter->lock;
20221 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20223         /* gets dropped in rt_mutex_adjust_prio_chain()! */
20224         get_task_struct(task);
20226 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
20227         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
20228                                    next_lock, NULL, task);
20230 @@ -1161,16 +1653,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
20231   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
20232   * @lock:               the rt_mutex to take
20233   * @state:              the state the task should block in (TASK_INTERRUPTIBLE
20234 - *                      or TASK_UNINTERRUPTIBLE)
20235 + *                      or TASK_UNINTERRUPTIBLE)
20236   * @timeout:            the pre-initialized and started timer, or NULL for none
20237   * @waiter:             the pre-initialized rt_mutex_waiter
20238   *
20239 - * lock->wait_lock must be held by the caller.
20240 + * Must be called with lock->wait_lock held and interrupts disabled
20241   */
20242  static int __sched
20243  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
20244                     struct hrtimer_sleeper *timeout,
20245 -                   struct rt_mutex_waiter *waiter)
20246 +                   struct rt_mutex_waiter *waiter,
20247 +                   struct ww_acquire_ctx *ww_ctx)
20249         int ret = 0;
20251 @@ -1193,13 +1686,19 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
20252                                 break;
20253                 }
20255 -               raw_spin_unlock(&lock->wait_lock);
20256 +               if (ww_ctx && ww_ctx->acquired > 0) {
20257 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
20258 +                       if (ret)
20259 +                               break;
20260 +               }
20262 +               raw_spin_unlock_irq(&lock->wait_lock);
20264                 debug_rt_mutex_print_deadlock(waiter);
20266                 schedule();
20268 -               raw_spin_lock(&lock->wait_lock);
20269 +               raw_spin_lock_irq(&lock->wait_lock);
20270                 set_current_state(state);
20271         }
20273 @@ -1227,26 +1726,112 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
20274         }
20277 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
20278 +                                                  struct ww_acquire_ctx *ww_ctx)
20280 +#ifdef CONFIG_DEBUG_MUTEXES
20281 +       /*
20282 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
20283 +        * but released with a normal mutex_unlock in this call.
20284 +        *
20285 +        * This should never happen, always use ww_mutex_unlock.
20286 +        */
20287 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
20289 +       /*
20290 +        * Not quite done after calling ww_acquire_done() ?
20291 +        */
20292 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
20294 +       if (ww_ctx->contending_lock) {
20295 +               /*
20296 +                * After -EDEADLK you tried to
20297 +                * acquire a different ww_mutex? Bad!
20298 +                */
20299 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
20301 +               /*
20302 +                * You called ww_mutex_lock after receiving -EDEADLK,
20303 +                * but 'forgot' to unlock everything else first?
20304 +                */
20305 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
20306 +               ww_ctx->contending_lock = NULL;
20307 +       }
20309 +       /*
20310 +        * Naughty, using a different class will lead to undefined behavior!
20311 +        */
20312 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
20313 +#endif
20314 +       ww_ctx->acquired++;
20317 +#ifdef CONFIG_PREEMPT_RT_FULL
20318 +static void ww_mutex_account_lock(struct rt_mutex *lock,
20319 +                                 struct ww_acquire_ctx *ww_ctx)
20321 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
20322 +       struct rt_mutex_waiter *waiter, *n;
20324 +       /*
20325 +        * This branch gets optimized out for the common case,
20326 +        * and is only important for ww_mutex_lock.
20327 +        */
20328 +       ww_mutex_lock_acquired(ww, ww_ctx);
20329 +       ww->ctx = ww_ctx;
20331 +       /*
20332 +        * Give any possible sleeping processes the chance to wake up,
20333 +        * so they can recheck if they have to back off.
20334 +        */
20335 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
20336 +                                            tree_entry) {
20337 +               /* XXX debug rt mutex waiter wakeup */
20339 +               BUG_ON(waiter->lock != lock);
20340 +               rt_mutex_wake_waiter(waiter);
20341 +       }
20344 +#else
20346 +static void ww_mutex_account_lock(struct rt_mutex *lock,
20347 +                                 struct ww_acquire_ctx *ww_ctx)
20349 +       BUG();
20351 +#endif
20353  /*
20354   * Slow path lock function:
20355   */
20356  static int __sched
20357  rt_mutex_slowlock(struct rt_mutex *lock, int state,
20358                   struct hrtimer_sleeper *timeout,
20359 -                 enum rtmutex_chainwalk chwalk)
20360 +                 enum rtmutex_chainwalk chwalk,
20361 +                 struct ww_acquire_ctx *ww_ctx)
20363         struct rt_mutex_waiter waiter;
20364 +       unsigned long flags;
20365         int ret = 0;
20367 -       debug_rt_mutex_init_waiter(&waiter);
20368 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
20369 -       RB_CLEAR_NODE(&waiter.tree_entry);
20370 +       rt_mutex_init_waiter(&waiter, false);
20372 -       raw_spin_lock(&lock->wait_lock);
20373 +       /*
20374 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
20375 +        * be called in early boot if the cmpxchg() fast path is disabled
20376 +        * (debug, no architecture support). In this case we will acquire the
20377 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
20378 +        * enable interrupts in that early boot case. So we need to use the
20379 +        * irqsave/restore variants.
20380 +        */
20381 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20383         /* Try to acquire the lock again: */
20384         if (try_to_take_rt_mutex(lock, current, NULL)) {
20385 -               raw_spin_unlock(&lock->wait_lock);
20386 +               if (ww_ctx)
20387 +                       ww_mutex_account_lock(lock, ww_ctx);
20388 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20389                 return 0;
20390         }
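The comment added in the hunk above explains why rt_mutex_slowlock() switches to the irqsave/irqrestore spinlock variants: with the cmpxchg fast path disabled, the slow path can run in early boot while interrupts are already off, and an unconditional _irq unlock would wrongly re-enable them. The following stand-alone user-space sketch (hypothetical names, not kernel code) models that difference with a simulated interrupt flag:

    /* Illustrative model of _irq vs _irqsave locking; all names hypothetical. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool irqs_on = true;               /* simulated local interrupt state */

    static void lock_irq(void)                { irqs_on = false; }
    static void unlock_irq(void)              { irqs_on = true; }   /* unconditional enable */

    static void lock_irqsave(bool *flags)     { *flags = irqs_on; irqs_on = false; }
    static void unlock_irqrestore(bool flags) { irqs_on = flags; }  /* restore prior state */

    int main(void)
    {
            bool flags;

            irqs_on = false;                  /* "early boot": already disabled */
            lock_irq();
            unlock_irq();
            printf("_irq variant:     irqs_on=%d (wrongly re-enabled)\n", irqs_on);

            irqs_on = false;
            lock_irqsave(&flags);
            unlock_irqrestore(flags);
            printf("_irqsave variant: irqs_on=%d (prior state preserved)\n", irqs_on);
            return 0;
    }

Only the save/restore pair leaves the pre-existing "disabled" state intact, which is the property the patch relies on for the early-boot case.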
20392 @@ -1260,13 +1845,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20394         if (likely(!ret))
20395                 /* sleep on the mutex */
20396 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
20397 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
20398 +                                         ww_ctx);
20399 +       else if (ww_ctx) {
20400 +               /* ww_mutex received EDEADLK, let it become EALREADY */
20401 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
20402 +               BUG_ON(!ret);
20403 +       }
20405         if (unlikely(ret)) {
20406                 __set_current_state(TASK_RUNNING);
20407                 if (rt_mutex_has_waiters(lock))
20408                         remove_waiter(lock, &waiter);
20409 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
20410 +               /* ww_mutex want to report EDEADLK/EALREADY, let them */
20411 +               if (!ww_ctx)
20412 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
20413 +       } else if (ww_ctx) {
20414 +               ww_mutex_account_lock(lock, ww_ctx);
20415         }
20417         /*
20418 @@ -1275,7 +1870,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20419          */
20420         fixup_rt_mutex_waiters(lock);
20422 -       raw_spin_unlock(&lock->wait_lock);
20423 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20425         /* Remove pending timer: */
20426         if (unlikely(timeout))
20427 @@ -1291,6 +1886,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
20428   */
20429  static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20431 +       unsigned long flags;
20432         int ret;
20434         /*
20435 @@ -1302,10 +1898,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20436                 return 0;
20438         /*
20439 -        * The mutex has currently no owner. Lock the wait lock and
20440 -        * try to acquire the lock.
20441 +        * The mutex has currently no owner. Lock the wait lock and try to
20442 +        * acquire the lock. We use irqsave here to support early boot calls.
20443          */
20444 -       raw_spin_lock(&lock->wait_lock);
20445 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20447         ret = try_to_take_rt_mutex(lock, current, NULL);
20449 @@ -1315,7 +1911,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20450          */
20451         fixup_rt_mutex_waiters(lock);
20453 -       raw_spin_unlock(&lock->wait_lock);
20454 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20456         return ret;
20458 @@ -1325,9 +1921,13 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
20459   * Return whether the current task needs to undo a potential priority boosting.
20460   */
20461  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20462 -                                       struct wake_q_head *wake_q)
20463 +                                       struct wake_q_head *wake_q,
20464 +                                       struct wake_q_head *wake_sleeper_q)
20466 -       raw_spin_lock(&lock->wait_lock);
20467 +       unsigned long flags;
20469 +       /* irqsave required to support early boot calls */
20470 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
20472         debug_rt_mutex_unlock(lock);
20474 @@ -1366,10 +1966,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20475          */
20476         while (!rt_mutex_has_waiters(lock)) {
20477                 /* Drops lock->wait_lock ! */
20478 -               if (unlock_rt_mutex_safe(lock) == true)
20479 +               if (unlock_rt_mutex_safe(lock, flags) == true)
20480                         return false;
20481                 /* Relock the rtmutex and try again */
20482 -               raw_spin_lock(&lock->wait_lock);
20483 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
20484         }
20486         /*
20487 @@ -1378,9 +1978,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20488          *
20489          * Queue the next waiter for wakeup once we release the wait_lock.
20490          */
20491 -       mark_wakeup_next_waiter(wake_q, lock);
20492 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
20494 -       raw_spin_unlock(&lock->wait_lock);
20495 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
20497         /* check PI boosting */
20498         return true;
20499 @@ -1394,31 +1994,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
20500   */
20501  static inline int
20502  rt_mutex_fastlock(struct rt_mutex *lock, int state,
20503 +                 struct ww_acquire_ctx *ww_ctx,
20504                   int (*slowfn)(struct rt_mutex *lock, int state,
20505                                 struct hrtimer_sleeper *timeout,
20506 -                               enum rtmutex_chainwalk chwalk))
20507 +                               enum rtmutex_chainwalk chwalk,
20508 +                               struct ww_acquire_ctx *ww_ctx))
20510         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
20511                 rt_mutex_deadlock_account_lock(lock, current);
20512                 return 0;
20513         } else
20514 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
20515 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
20516 +                             ww_ctx);
20519  static inline int
20520  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
20521                         struct hrtimer_sleeper *timeout,
20522                         enum rtmutex_chainwalk chwalk,
20523 +                       struct ww_acquire_ctx *ww_ctx,
20524                         int (*slowfn)(struct rt_mutex *lock, int state,
20525                                       struct hrtimer_sleeper *timeout,
20526 -                                     enum rtmutex_chainwalk chwalk))
20527 +                                     enum rtmutex_chainwalk chwalk,
20528 +                                     struct ww_acquire_ctx *ww_ctx))
20530         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
20531             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
20532                 rt_mutex_deadlock_account_lock(lock, current);
20533                 return 0;
20534         } else
20535 -               return slowfn(lock, state, timeout, chwalk);
20536 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
20539  static inline int
20540 @@ -1435,17 +2040,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
20541  static inline void
20542  rt_mutex_fastunlock(struct rt_mutex *lock,
20543                     bool (*slowfn)(struct rt_mutex *lock,
20544 -                                  struct wake_q_head *wqh))
20545 +                                  struct wake_q_head *wqh,
20546 +                                  struct wake_q_head *wq_sleeper))
20548         WAKE_Q(wake_q);
20549 +       WAKE_Q(wake_sleeper_q);
20551         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20552                 rt_mutex_deadlock_account_unlock(current);
20554         } else {
20555 -               bool deboost = slowfn(lock, &wake_q);
20556 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
20558                 wake_up_q(&wake_q);
20559 +               wake_up_q_sleeper(&wake_sleeper_q);
20561                 /* Undo pi boosting if necessary: */
20562                 if (deboost)
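The hunks above split wakeups into two queues: ordinary waiters stay on wake_q, while waiters whose task state was saved (waiter->savestate, the rtmutex-based "sleeping spinlock" case) go on a separate wake_sleeper_q that is flushed by its own helper. A minimal user-space sketch of routing items into two queues by such a flag (hypothetical types, not the kernel wake_q API):

    #include <stdio.h>

    struct waiter { const char *name; int savestate; struct waiter *next; };
    struct wq     { struct waiter *head; };

    static void wq_add(struct wq *q, struct waiter *w)
    {
            w->next = q->head;                /* LIFO is fine for the illustration */
            q->head = w;
    }

    static void wq_wake_all(struct wq *q, const char *how)
    {
            for (struct waiter *w = q->head; w; w = w->next)
                    printf("waking %s via %s\n", w->name, how);
            q->head = NULL;
    }

    int main(void)
    {
            struct waiter a = { "task-A", 0, NULL };
            struct waiter b = { "task-B", 1, NULL };   /* sleeper: state was saved */
            struct wq wake_q = { NULL }, wake_sleeper_q = { NULL };
            struct waiter *list[] = { &a, &b };

            for (unsigned i = 0; i < 2; i++)          /* route by the savestate flag */
                    wq_add(list[i]->savestate ? &wake_sleeper_q : &wake_q, list[i]);

            wq_wake_all(&wake_q, "normal wakeup");
            wq_wake_all(&wake_sleeper_q, "sleeper wakeup");
            return 0;
    }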
20563 @@ -1462,7 +2070,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
20565         might_sleep();
20567 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
20568 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
20570  EXPORT_SYMBOL_GPL(rt_mutex_lock);
20572 @@ -1479,7 +2087,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
20574         might_sleep();
20576 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
20577 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
20579  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
20581 @@ -1492,11 +2100,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
20582         might_sleep();
20584         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
20585 -                                      RT_MUTEX_FULL_CHAINWALK,
20586 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
20587                                        rt_mutex_slowlock);
20590  /**
20591 + * rt_mutex_lock_killable - lock a rt_mutex killable
20592 + *
20593 + * @lock:              the rt_mutex to be locked
20594 + * @detect_deadlock:   deadlock detection on/off
20595 + *
20596 + * Returns:
20597 + *  0          on success
20598 + * -EINTR      when interrupted by a signal
20599 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
20600 + */
20601 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
20603 +       might_sleep();
20605 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
20607 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
20609 +/**
20610   * rt_mutex_timed_lock - lock a rt_mutex interruptible
20611   *                     the timeout structure is provided
20612   *                     by the caller
20613 @@ -1516,6 +2143,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
20615         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
20616                                        RT_MUTEX_MIN_CHAINWALK,
20617 +                                      NULL,
20618                                        rt_mutex_slowlock);
20620  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20621 @@ -1533,7 +2161,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
20622   */
20623  int __sched rt_mutex_trylock(struct rt_mutex *lock)
20625 +#ifdef CONFIG_PREEMPT_RT_FULL
20626 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
20627 +#else
20628         if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
20629 +#endif
20630                 return 0;
20632         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
20633 @@ -1559,13 +2191,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
20634   * required or not.
20635   */
20636  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
20637 -                                  struct wake_q_head *wqh)
20638 +                                  struct wake_q_head *wqh,
20639 +                                  struct wake_q_head *wq_sleeper)
20641         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
20642                 rt_mutex_deadlock_account_unlock(current);
20643                 return false;
20644         }
20645 -       return rt_mutex_slowunlock(lock, wqh);
20646 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
20649  /**
20650 @@ -1598,13 +2231,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
20651  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
20653         lock->owner = NULL;
20654 -       raw_spin_lock_init(&lock->wait_lock);
20655         lock->waiters = RB_ROOT;
20656         lock->waiters_leftmost = NULL;
20658         debug_rt_mutex_init(lock, name);
20660 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
20661 +EXPORT_SYMBOL(__rt_mutex_init);
20663  /**
20664   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
20665 @@ -1619,7 +2251,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
20666  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20667                                 struct task_struct *proxy_owner)
20669 -       __rt_mutex_init(lock, NULL);
20670 +       rt_mutex_init(lock);
20671         debug_rt_mutex_proxy_lock(lock, proxy_owner);
20672         rt_mutex_set_owner(lock, proxy_owner);
20673         rt_mutex_deadlock_account_lock(lock, proxy_owner);
20674 @@ -1660,13 +2292,42 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20676         int ret;
20678 -       raw_spin_lock(&lock->wait_lock);
20679 +       raw_spin_lock_irq(&lock->wait_lock);
20681         if (try_to_take_rt_mutex(lock, task, NULL)) {
20682 -               raw_spin_unlock(&lock->wait_lock);
20683 +               raw_spin_unlock_irq(&lock->wait_lock);
20684                 return 1;
20685         }
20687 +#ifdef CONFIG_PREEMPT_RT_FULL
20688 +       /*
20689 +        * In PREEMPT_RT there's an added race.
20690 +        * If the task, that we are about to requeue, times out,
20691 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
20692 +        * to skip this task. But right after the task sets
20693 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
20694 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
20695 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
20696 +        * lock that it blocks on. We *must not* place this task
20697 +        * on this proxy lock in that case.
20698 +        *
20699 +        * To prevent this race, we first take the task's pi_lock
20700 +        * and check if it has updated its pi_blocked_on. If it has,
20701 +        * we assume that it woke up and we return -EAGAIN.
20702 +        * Otherwise, we set the task's pi_blocked_on to
20703 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
20704 +        * it will know that we are in the process of requeuing it.
20705 +        */
20706 +       raw_spin_lock(&task->pi_lock);
20707 +       if (task->pi_blocked_on) {
20708 +               raw_spin_unlock(&task->pi_lock);
20709 +               raw_spin_unlock_irq(&lock->wait_lock);
20710 +               return -EAGAIN;
20711 +       }
20712 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
20713 +       raw_spin_unlock(&task->pi_lock);
20714 +#endif
20716         /* We enforce deadlock detection for futexes */
20717         ret = task_blocks_on_rt_mutex(lock, waiter, task,
20718                                       RT_MUTEX_FULL_CHAINWALK);
20719 @@ -1681,10 +2342,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
20720                 ret = 0;
20721         }
20723 -       if (unlikely(ret))
20724 +       if (ret && rt_mutex_has_waiters(lock))
20725                 remove_waiter(lock, waiter);
20727 -       raw_spin_unlock(&lock->wait_lock);
20728 +       raw_spin_unlock_irq(&lock->wait_lock);
20730         debug_rt_mutex_print_deadlock(waiter);
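The long comment in the hunk above describes a handshake: before requeueing, the requeue side takes the task's pi_lock, returns -EAGAIN if pi_blocked_on has already been updated, and otherwise stamps it with PI_REQUEUE_INPROGRESS so a concurrent wakeup can see that a requeue is in flight. A compressed user-space sketch of that claim-or-back-off step (the patch does it under task->pi_lock; this sketch folds it into one compare-exchange, with hypothetical names):

    #include <errno.h>
    #include <stdatomic.h>
    #include <stdio.h>

    /* Sentinel "pointer" analogous to PI_REQUEUE_INPROGRESS in the patch. */
    #define REQUEUE_INPROGRESS ((void *)2)

    static _Atomic(void *) pi_blocked_on;     /* NULL, a real lock, or a sentinel */

    /* Try to claim the task for requeueing; fail if it already moved on. */
    static int start_requeue(void)
    {
            void *expected = NULL;

            if (!atomic_compare_exchange_strong(&pi_blocked_on, &expected,
                                                REQUEUE_INPROGRESS))
                    return -EAGAIN;           /* task updated pi_blocked_on first */
            return 0;
    }

    int main(void)
    {
            printf("first attempt:  %d\n", start_requeue());   /* 0: claimed   */
            printf("second attempt: %d\n", start_requeue());   /* -EAGAIN      */
            return 0;
    }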
20732 @@ -1732,12 +2393,12 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20734         int ret;
20736 -       raw_spin_lock(&lock->wait_lock);
20737 +       raw_spin_lock_irq(&lock->wait_lock);
20739         set_current_state(TASK_INTERRUPTIBLE);
20741         /* sleep on the mutex */
20742 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
20743 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
20745         if (unlikely(ret))
20746                 remove_waiter(lock, waiter);
20747 @@ -1748,7 +2409,93 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20748          */
20749         fixup_rt_mutex_waiters(lock);
20751 -       raw_spin_unlock(&lock->wait_lock);
20752 +       raw_spin_unlock_irq(&lock->wait_lock);
20754         return ret;
20757 +static inline int
20758 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
20760 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
20761 +       unsigned tmp;
20763 +       if (ctx->deadlock_inject_countdown-- == 0) {
20764 +               tmp = ctx->deadlock_inject_interval;
20765 +               if (tmp > UINT_MAX/4)
20766 +                       tmp = UINT_MAX;
20767 +               else
20768 +                       tmp = tmp*2 + tmp + tmp/2;
20770 +               ctx->deadlock_inject_interval = tmp;
20771 +               ctx->deadlock_inject_countdown = tmp;
20772 +               ctx->contending_lock = lock;
20774 +               ww_mutex_unlock(lock);
20776 +               return -EDEADLK;
20777 +       }
20778 +#endif
20780 +       return 0;
20783 +#ifdef CONFIG_PREEMPT_RT_FULL
20784 +int __sched
20785 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20787 +       int ret;
20789 +       might_sleep();
20791 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20792 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
20793 +       if (ret)
20794 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20795 +       else if (!ret && ww_ctx->acquired > 1)
20796 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20798 +       return ret;
20800 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
20802 +int __sched
20803 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
20805 +       int ret;
20807 +       might_sleep();
20809 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
20810 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
20811 +       if (ret)
20812 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
20813 +       else if (!ret && ww_ctx->acquired > 1)
20814 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
20816 +       return ret;
20818 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
20820 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
20822 +       int nest = !!lock->ctx;
20824 +       /*
20825 +        * The unlocking fastpath is the 0->1 transition from 'locked'
20826 +        * into 'unlocked' state:
20827 +        */
20828 +       if (nest) {
20829 +#ifdef CONFIG_DEBUG_MUTEXES
20830 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
20831 +#endif
20832 +               if (lock->ctx->acquired > 0)
20833 +                       lock->ctx->acquired--;
20834 +               lock->ctx = NULL;
20835 +       }
20837 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
20838 +       rt_mutex_unlock(&lock->base.lock);
20840 +EXPORT_SYMBOL(ww_mutex_unlock);
20841 +#endif
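ww_mutex_deadlock_injection() above grows the injection interval as tmp*2 + tmp + tmp/2, i.e. roughly 3.5x per forced -EDEADLK, and clamps to UINT_MAX once the interval exceeds UINT_MAX/4. A tiny stand-alone program that prints the resulting sequence (illustration only; the starting value is a hypothetical choice):

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned tmp = 1;                 /* hypothetical initial interval */

            for (int i = 0; i < 10; i++) {
                    printf("interval %d: %u\n", i, tmp);
                    if (tmp > UINT_MAX / 4)
                            tmp = UINT_MAX;                 /* clamp, as above */
                    else
                            tmp = tmp * 2 + tmp + tmp / 2;  /* ~3.5x growth    */
            }
            return 0;
    }

The geometric growth means deadlock injection quickly becomes rare on long-running workloads while still exercising the -EDEADLK/-EALREADY paths early on.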
20842 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
20843 index e317e1cbb3eb..f457c7574920 100644
20844 --- a/kernel/locking/rtmutex_common.h
20845 +++ b/kernel/locking/rtmutex_common.h
20846 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
20847         struct rb_node          pi_tree_entry;
20848         struct task_struct      *task;
20849         struct rt_mutex         *lock;
20850 +       bool                    savestate;
20851  #ifdef CONFIG_DEBUG_RT_MUTEXES
20852         unsigned long           ip;
20853         struct pid              *deadlock_task_pid;
20854 @@ -98,6 +99,9 @@ enum rtmutex_chainwalk {
20855  /*
20856   * PI-futex support (proxy locking functions, etc.):
20857   */
20858 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
20859 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
20861  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
20862  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
20863                                        struct task_struct *proxy_owner);
20864 @@ -111,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
20865                                       struct rt_mutex_waiter *waiter);
20866  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
20867  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
20868 -                                 struct wake_q_head *wqh);
20869 +                                 struct wake_q_head *wqh,
20870 +                                 struct wake_q_head *wq_sleeper);
20871  extern void rt_mutex_adjust_prio(struct task_struct *task);
20873  #ifdef CONFIG_DEBUG_RT_MUTEXES
20874 @@ -120,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
20875  # include "rtmutex.h"
20876  #endif
20878 +static inline void
20879 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
20881 +       debug_rt_mutex_init_waiter(waiter);
20882 +       waiter->task = NULL;
20883 +       waiter->savestate = savestate;
20884 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
20885 +       RB_CLEAR_NODE(&waiter->tree_entry);
20888  #endif
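PI_WAKEUP_INPROGRESS and PI_REQUEUE_INPROGRESS above are small integers cast to waiter pointers, so pi_blocked_on can encode a transient state without extra storage; callers such as rt_mutex_adjust_pi() then use a rt_mutex_real_waiter()-style test before dereferencing. A minimal user-space sketch of that sentinel-pointer pattern (hypothetical helper, not the patch's exact macro):

    #include <stdio.h>

    struct waiter { int prio; };

    /* Small non-pointer values used as state markers, as in the patch. */
    #define WAKEUP_INPROGRESS  ((struct waiter *)1)
    #define REQUEUE_INPROGRESS ((struct waiter *)2)

    /* True only for an actual waiter object: not NULL and not a sentinel. */
    static int real_waiter(const struct waiter *w)
    {
            return w && w != WAKEUP_INPROGRESS && w != REQUEUE_INPROGRESS;
    }

    int main(void)
    {
            struct waiter w = { .prio = 10 };

            printf("%d %d %d\n",
                   real_waiter(NULL),                /* 0 */
                   real_waiter(WAKEUP_INPROGRESS),   /* 0 */
                   real_waiter(&w));                 /* 1 */
            return 0;
    }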
20889 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
20890 index db3ccb1dd614..909779647bd1 100644
20891 --- a/kernel/locking/spinlock.c
20892 +++ b/kernel/locking/spinlock.c
20893 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
20894   *         __[spin|read|write]_lock_bh()
20895   */
20896  BUILD_LOCK_OPS(spin, raw_spinlock);
20898 +#ifndef CONFIG_PREEMPT_RT_FULL
20899  BUILD_LOCK_OPS(read, rwlock);
20900  BUILD_LOCK_OPS(write, rwlock);
20901 +#endif
20903  #endif
20905 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
20906  EXPORT_SYMBOL(_raw_spin_unlock_bh);
20907  #endif
20909 +#ifndef CONFIG_PREEMPT_RT_FULL
20911  #ifndef CONFIG_INLINE_READ_TRYLOCK
20912  int __lockfunc _raw_read_trylock(rwlock_t *lock)
20914 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
20915  EXPORT_SYMBOL(_raw_write_unlock_bh);
20916  #endif
20918 +#endif /* !PREEMPT_RT_FULL */
20920  #ifdef CONFIG_DEBUG_LOCK_ALLOC
20922  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
20923 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
20924 index 0374a596cffa..94970338d518 100644
20925 --- a/kernel/locking/spinlock_debug.c
20926 +++ b/kernel/locking/spinlock_debug.c
20927 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
20929  EXPORT_SYMBOL(__raw_spin_lock_init);
20931 +#ifndef CONFIG_PREEMPT_RT_FULL
20932  void __rwlock_init(rwlock_t *lock, const char *name,
20933                    struct lock_class_key *key)
20935 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
20938  EXPORT_SYMBOL(__rwlock_init);
20939 +#endif
20941  static void spin_dump(raw_spinlock_t *lock, const char *msg)
20943 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
20944         arch_spin_unlock(&lock->raw_lock);
20947 +#ifndef CONFIG_PREEMPT_RT_FULL
20948  static void rwlock_bug(rwlock_t *lock, const char *msg)
20950         if (!debug_locks_off())
20951 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
20952         debug_write_unlock(lock);
20953         arch_write_unlock(&lock->raw_lock);
20956 +#endif
20957 diff --git a/kernel/module.c b/kernel/module.c
20958 index b14a4f31221f..be3f497a089d 100644
20959 --- a/kernel/module.c
20960 +++ b/kernel/module.c
20961 @@ -682,16 +682,7 @@ static void percpu_modcopy(struct module *mod,
20962                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
20965 -/**
20966 - * is_module_percpu_address - test whether address is from module static percpu
20967 - * @addr: address to test
20968 - *
20969 - * Test whether @addr belongs to module static percpu area.
20970 - *
20971 - * RETURNS:
20972 - * %true if @addr is from module static percpu area
20973 - */
20974 -bool is_module_percpu_address(unsigned long addr)
20975 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
20977         struct module *mod;
20978         unsigned int cpu;
20979 @@ -705,9 +696,11 @@ bool is_module_percpu_address(unsigned long addr)
20980                         continue;
20981                 for_each_possible_cpu(cpu) {
20982                         void *start = per_cpu_ptr(mod->percpu, cpu);
20983 +                       void *va = (void *)addr;
20985 -                       if ((void *)addr >= start &&
20986 -                           (void *)addr < start + mod->percpu_size) {
20987 +                       if (va >= start && va < start + mod->percpu_size) {
20988 +                               if (can_addr)
20989 +                                       *can_addr = (unsigned long) (va - start);
20990                                 preempt_enable();
20991                                 return true;
20992                         }
20993 @@ -718,6 +711,20 @@ bool is_module_percpu_address(unsigned long addr)
20994         return false;
20997 +/**
20998 + * is_module_percpu_address - test whether address is from module static percpu
20999 + * @addr: address to test
21000 + *
21001 + * Test whether @addr belongs to module static percpu area.
21002 + *
21003 + * RETURNS:
21004 + * %true if @addr is from module static percpu area
21005 + */
21006 +bool is_module_percpu_address(unsigned long addr)
21008 +       return __is_module_percpu_address(addr, NULL);
21011  #else /* ... !CONFIG_SMP */
21013  static inline void __percpu *mod_percpu(struct module *mod)
21014 @@ -749,6 +756,11 @@ bool is_module_percpu_address(unsigned long addr)
21015         return false;
21018 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
21020 +       return false;
21023  #endif /* CONFIG_SMP */
21025  #define MODINFO_ATTR(field)    \
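__is_module_percpu_address() above reports not only whether an address lies inside a module's static per-CPU area but also, via can_addr, the offset of the hit within that area. The core test is a simple half-open range check; a user-space sketch with a plain buffer standing in for the per-CPU chunk (hypothetical names):

    #include <stdbool.h>
    #include <stdio.h>

    /* True if addr is inside [start, start + size); optionally report the
     * offset of the hit - the shape of the per-CPU check above. */
    static bool in_region(const char *start, unsigned long size,
                          const char *addr, unsigned long *offset)
    {
            if (addr >= start && addr < start + size) {
                    if (offset)
                            *offset = (unsigned long)(addr - start);
                    return true;
            }
            return false;
    }

    int main(void)
    {
            static char region[64];           /* stand-in for a percpu chunk */
            unsigned long off;

            if (in_region(region, sizeof(region), &region[13], &off))
                    printf("hit at offset %lu\n", off);      /* prints 13 */
            printf("miss: %d\n", in_region(region, sizeof(region),
                                           region + sizeof(region), NULL));
            return 0;
    }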
21026 diff --git a/kernel/panic.c b/kernel/panic.c
21027 index 41e2b54f36b5..3535f802953a 100644
21028 --- a/kernel/panic.c
21029 +++ b/kernel/panic.c
21030 @@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
21031                 cpu_relax();
21035 + * Stop ourselves in NMI context if another CPU has already panicked. Arch code
21036 + * may override this to prepare for crash dumping, e.g. save regs info.
21037 + */
21038 +void __weak nmi_panic_self_stop(struct pt_regs *regs)
21040 +       panic_smp_self_stop();
21043 +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
21046 + * A variant of panic() called from NMI context. We return if we've already
21047 + * panicked on this CPU. If another CPU already panicked, loop in
21048 + * nmi_panic_self_stop() which can provide architecture dependent code such
21049 + * as saving register state for crash dump.
21050 + */
21051 +void nmi_panic(struct pt_regs *regs, const char *msg)
21053 +       int old_cpu, cpu;
21055 +       cpu = raw_smp_processor_id();
21056 +       old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
21058 +       if (old_cpu == PANIC_CPU_INVALID)
21059 +               panic("%s", msg);
21060 +       else if (old_cpu != cpu)
21061 +               nmi_panic_self_stop(regs);
21063 +EXPORT_SYMBOL(nmi_panic);
21065  /**
21066   *     panic - halt the system
21067   *     @fmt: The text string to print
21068 @@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
21069   */
21070  void panic(const char *fmt, ...)
21072 -       static DEFINE_SPINLOCK(panic_lock);
21073         static char buf[1024];
21074         va_list args;
21075         long i, i_next = 0;
21076         int state = 0;
21077 +       int old_cpu, this_cpu;
21079         /*
21080          * Disable local interrupts. This will prevent panic_smp_self_stop
21081          * from deadlocking the first cpu that invokes the panic, since
21082          * there is nothing to prevent an interrupt handler (that runs
21083 -        * after the panic_lock is acquired) from invoking panic again.
21084 +        * after setting panic_cpu) from invoking panic() again.
21085          */
21086         local_irq_disable();
21088 @@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
21089          * multiple parallel invocations of panic, all other CPUs either
21090          * stop themself or will wait until they are stopped by the 1st CPU
21091          * with smp_send_stop().
21092 +        *
21093 +        * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
21094 +        * comes here, so go ahead.
21095 +        * `old_cpu == this_cpu' means we came from nmi_panic() which sets
21096 +        * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
21097          */
21098 -       if (!spin_trylock(&panic_lock))
21099 +       this_cpu = raw_smp_processor_id();
21100 +       old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
21102 +       if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
21103                 panic_smp_self_stop();
21105         console_verbose();
21106 @@ -400,9 +439,11 @@ static u64 oops_id;
21108  static int init_oops_id(void)
21110 +#ifndef CONFIG_PREEMPT_RT_FULL
21111         if (!oops_id)
21112                 get_random_bytes(&oops_id, sizeof(oops_id));
21113         else
21114 +#endif
21115                 oops_id++;
21117         return 0;
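Both panic() and nmi_panic() above race on a single atomic panic_cpu slot: the first CPU to swing it from PANIC_CPU_INVALID to its own id carries on, a CPU that already owns the slot (the nmi_panic-then-panic case) also carries on, and every other CPU parks itself. A user-space model of that decision, with CPU ids passed in explicitly (hypothetical names):

    #include <stdatomic.h>
    #include <stdio.h>

    #define PANIC_CPU_INVALID (-1)

    static atomic_int panic_cpu = PANIC_CPU_INVALID;

    /* Returns 1 if this "CPU" should proceed with the panic, 0 if it must stop. */
    static int panic_enter(int this_cpu)
    {
            int old_cpu = PANIC_CPU_INVALID;

            atomic_compare_exchange_strong(&panic_cpu, &old_cpu, this_cpu);

            if (old_cpu == PANIC_CPU_INVALID)
                    return 1;                 /* first CPU in: do the panic        */
            if (old_cpu == this_cpu)
                    return 1;                 /* re-entry via nmi_panic(), same CPU */
            return 0;                         /* someone else is panicking: stop   */
    }

    int main(void)
    {
            printf("cpu0 first call: %d\n", panic_enter(0));  /* 1 */
            printf("cpu0 re-entry:   %d\n", panic_enter(0));  /* 1 */
            printf("cpu1 late call:  %d\n", panic_enter(1));  /* 0 */
            return 0;
    }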
21118 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
21119 index 3124cebaec31..c1b981521dd0 100644
21120 --- a/kernel/power/hibernate.c
21121 +++ b/kernel/power/hibernate.c
21122 @@ -285,6 +285,8 @@ static int create_image(int platform_mode)
21124         local_irq_disable();
21126 +       system_state = SYSTEM_SUSPEND;
21128         error = syscore_suspend();
21129         if (error) {
21130                 printk(KERN_ERR "PM: Some system devices failed to power down, "
21131 @@ -314,6 +316,7 @@ static int create_image(int platform_mode)
21132         syscore_resume();
21134   Enable_irqs:
21135 +       system_state = SYSTEM_RUNNING;
21136         local_irq_enable();
21138   Enable_cpus:
21139 @@ -438,6 +441,7 @@ static int resume_target_kernel(bool platform_mode)
21140                 goto Enable_cpus;
21142         local_irq_disable();
21143 +       system_state = SYSTEM_SUSPEND;
21145         error = syscore_suspend();
21146         if (error)
21147 @@ -471,6 +475,7 @@ static int resume_target_kernel(bool platform_mode)
21148         syscore_resume();
21150   Enable_irqs:
21151 +       system_state = SYSTEM_RUNNING;
21152         local_irq_enable();
21154   Enable_cpus:
21155 @@ -556,6 +561,7 @@ int hibernation_platform_enter(void)
21156                 goto Enable_cpus;
21158         local_irq_disable();
21159 +       system_state = SYSTEM_SUSPEND;
21160         syscore_suspend();
21161         if (pm_wakeup_pending()) {
21162                 error = -EAGAIN;
21163 @@ -568,6 +574,7 @@ int hibernation_platform_enter(void)
21165   Power_up:
21166         syscore_resume();
21167 +       system_state = SYSTEM_RUNNING;
21168         local_irq_enable();
21170   Enable_cpus:
21171 @@ -642,6 +649,10 @@ static void power_down(void)
21172                 cpu_relax();
21175 +#ifndef CONFIG_SUSPEND
21176 +bool pm_in_action;
21177 +#endif
21179  /**
21180   * hibernate - Carry out system hibernation, including saving the image.
21181   */
21182 @@ -654,6 +665,8 @@ int hibernate(void)
21183                 return -EPERM;
21184         }
21186 +       pm_in_action = true;
21188         lock_system_sleep();
21189         /* The snapshot device should not be opened while we're running */
21190         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
21191 @@ -719,6 +732,7 @@ int hibernate(void)
21192         atomic_inc(&snapshot_device_available);
21193   Unlock:
21194         unlock_system_sleep();
21195 +       pm_in_action = false;
21196         return error;
21199 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
21200 index f9fe133c13e2..393bc342c586 100644
21201 --- a/kernel/power/suspend.c
21202 +++ b/kernel/power/suspend.c
21203 @@ -359,6 +359,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
21204         arch_suspend_disable_irqs();
21205         BUG_ON(!irqs_disabled());
21207 +       system_state = SYSTEM_SUSPEND;
21209         error = syscore_suspend();
21210         if (!error) {
21211                 *wakeup = pm_wakeup_pending();
21212 @@ -375,6 +377,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
21213                 syscore_resume();
21214         }
21216 +       system_state = SYSTEM_RUNNING;
21218         arch_suspend_enable_irqs();
21219         BUG_ON(irqs_disabled());
21221 @@ -518,6 +522,8 @@ static int enter_state(suspend_state_t state)
21222         return error;
21225 +bool pm_in_action;
21227  /**
21228   * pm_suspend - Externally visible function for suspending the system.
21229   * @state: System sleep state to enter.
21230 @@ -532,6 +538,8 @@ int pm_suspend(suspend_state_t state)
21231         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
21232                 return -EINVAL;
21234 +       pm_in_action = true;
21236         error = enter_state(state);
21237         if (error) {
21238                 suspend_stats.fail++;
21239 @@ -539,6 +547,7 @@ int pm_suspend(suspend_state_t state)
21240         } else {
21241                 suspend_stats.success++;
21242         }
21243 +       pm_in_action = false;
21244         return error;
21246  EXPORT_SYMBOL(pm_suspend);
21247 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
21248 index 0b5613554769..99deb2617308 100644
21249 --- a/kernel/printk/printk.c
21250 +++ b/kernel/printk/printk.c
21251 @@ -241,6 +241,65 @@ struct printk_log {
21252   */
21253  static DEFINE_RAW_SPINLOCK(logbuf_lock);
21255 +#ifdef CONFIG_EARLY_PRINTK
21256 +struct console *early_console;
21258 +static void early_vprintk(const char *fmt, va_list ap)
21260 +       if (early_console) {
21261 +               char buf[512];
21262 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
21264 +               early_console->write(early_console, buf, n);
21265 +       }
21268 +asmlinkage void early_printk(const char *fmt, ...)
21270 +       va_list ap;
21272 +       va_start(ap, fmt);
21273 +       early_vprintk(fmt, ap);
21274 +       va_end(ap);
21278 + * This is independent of any log levels - a global
21279 + * kill switch that turns off all of printk.
21280 + *
21281 + * Used by the NMI watchdog if early-printk is enabled.
21282 + */
21283 +static bool __read_mostly printk_killswitch;
21285 +static int __init force_early_printk_setup(char *str)
21287 +       printk_killswitch = true;
21288 +       return 0;
21290 +early_param("force_early_printk", force_early_printk_setup);
21292 +void printk_kill(void)
21294 +       printk_killswitch = true;
21297 +#ifdef CONFIG_PRINTK
21298 +static int forced_early_printk(const char *fmt, va_list ap)
21300 +       if (!printk_killswitch)
21301 +               return 0;
21302 +       early_vprintk(fmt, ap);
21303 +       return 1;
21305 +#endif
21307 +#else
21308 +static inline int forced_early_printk(const char *fmt, va_list ap)
21310 +       return 0;
21312 +#endif
21314  #ifdef CONFIG_PRINTK
21315  DECLARE_WAIT_QUEUE_HEAD(log_wait);
21316  /* the next printk record to read by syslog(READ) or /proc/kmsg */
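The block above adds a global printk kill switch: once printk_kill() flips it (or force_early_printk is given on the command line), vprintk_emit() short-circuits into early_vprintk(), which formats into a local buffer and hands the text straight to the early console. A user-space sketch of that "divert to a raw writer once the switch is set" shape (hypothetical functions, stderr standing in for the early console):

    #include <stdarg.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool printk_killswitch;            /* set by a debugging facility */

    /* Raw fallback path: format locally and write directly. */
    static void early_vprint(const char *fmt, va_list ap)
    {
            char buf[512];

            vsnprintf(buf, sizeof(buf), fmt, ap);
            fputs(buf, stderr);
    }

    /* Normal entry point: divert to the raw path once the switch is set. */
    static void my_printk(const char *fmt, ...)
    {
            va_list ap;

            va_start(ap, fmt);
            if (printk_killswitch)
                    early_vprint(fmt, ap);
            else
                    vprintf(fmt, ap);         /* stands in for the buffered path */
            va_end(ap);
    }

    int main(void)
    {
            my_printk("via the normal path\n");
            printk_killswitch = true;         /* printk_kill() analogue */
            my_printk("via the raw early path\n");
            return 0;
    }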
21317 @@ -1203,6 +1262,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21319         char *text;
21320         int len = 0;
21321 +       int attempts = 0;
21323         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
21324         if (!text)
21325 @@ -1214,7 +1274,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21326                 u64 seq;
21327                 u32 idx;
21328                 enum log_flags prev;
21330 +               int num_msg;
21331 +try_again:
21332 +               attempts++;
21333 +               if (attempts > 10) {
21334 +                       len = -EBUSY;
21335 +                       goto out;
21336 +               }
21337 +               num_msg = 0;
21338                 if (clear_seq < log_first_seq) {
21339                         /* messages are gone, move to first available one */
21340                         clear_seq = log_first_seq;
21341 @@ -1235,6 +1302,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21342                         prev = msg->flags;
21343                         idx = log_next(idx);
21344                         seq++;
21345 +                       num_msg++;
21346 +                       if (num_msg > 5) {
21347 +                               num_msg = 0;
21348 +                               raw_spin_unlock_irq(&logbuf_lock);
21349 +                               raw_spin_lock_irq(&logbuf_lock);
21350 +                               if (clear_seq < log_first_seq)
21351 +                                       goto try_again;
21352 +                       }
21353                 }
21355                 /* move first record forward until length fits into the buffer */
21356 @@ -1248,6 +1323,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21357                         prev = msg->flags;
21358                         idx = log_next(idx);
21359                         seq++;
21360 +                       num_msg++;
21361 +                       if (num_msg > 5) {
21362 +                               num_msg = 0;
21363 +                               raw_spin_unlock_irq(&logbuf_lock);
21364 +                               raw_spin_lock_irq(&logbuf_lock);
21365 +                               if (clear_seq < log_first_seq)
21366 +                                       goto try_again;
21367 +                       }
21368                 }
21370                 /* last message fitting into this dump */
21371 @@ -1288,6 +1371,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
21372                 clear_seq = log_next_seq;
21373                 clear_idx = log_next_idx;
21374         }
21375 +out:
21376         raw_spin_unlock_irq(&logbuf_lock);
21378         kfree(text);
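The syslog_print_all() changes above bound how long logbuf_lock stays held: every five records the lock is dropped and immediately re-taken so higher-priority work can run, and if the producer overwrote the reader's starting point in that window (clear_seq < log_first_seq) the scan restarts, giving up with -EBUSY after ten attempts. A condensed user-space sketch of that "release periodically, restart if overtaken, cap the retries" loop (hypothetical counters, a pthread mutex standing in for the raw spinlock):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long first_seq;           /* oldest record still present */

    /* Walk records [start, end); return 0, or -1 after too many restarts. */
    static int scan_records(unsigned long start, unsigned long end)
    {
            int attempts = 0;

            pthread_mutex_lock(&buf_lock);
    again:
            if (++attempts > 10) {
                    pthread_mutex_unlock(&buf_lock);
                    return -1;                        /* -EBUSY in the patch */
            }
            for (unsigned long seq = start, n = 0; seq < end; seq++) {
                    /* ... copy record 'seq' here ... */
                    if (++n == 5) {
                            n = 0;
                            pthread_mutex_unlock(&buf_lock);  /* let others in */
                            pthread_mutex_lock(&buf_lock);
                            if (start < first_seq) {          /* overtaken */
                                    start = first_seq;
                                    goto again;
                            }
                    }
            }
            pthread_mutex_unlock(&buf_lock);
            return 0;
    }

    int main(void)
    {
            printf("scan: %d\n", scan_records(0, 20));
            return 0;
    }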
21379 @@ -1443,6 +1527,12 @@ static void call_console_drivers(int level,
21380         if (!console_drivers)
21381                 return;
21383 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
21384 +               if (in_irq() || in_nmi())
21385 +                       return;
21386 +       }
21388 +       migrate_disable();
21389         for_each_console(con) {
21390                 if (exclusive_console && con != exclusive_console)
21391                         continue;
21392 @@ -1458,6 +1548,7 @@ static void call_console_drivers(int level,
21393                 else
21394                         con->write(con, text, len);
21395         }
21396 +       migrate_enable();
21399  /*
21400 @@ -1518,6 +1609,15 @@ static inline int can_use_console(unsigned int cpu)
21401  static int console_trylock_for_printk(void)
21403         unsigned int cpu = smp_processor_id();
21404 +#ifdef CONFIG_PREEMPT_RT_FULL
21405 +       int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
21406 +               !irqs_disabled();
21407 +#else
21408 +       int lock = 1;
21409 +#endif
21411 +       if (!lock)
21412 +               return 0;
21414         if (!console_trylock())
21415                 return 0;
21416 @@ -1672,6 +1772,13 @@ asmlinkage int vprintk_emit(int facility, int level,
21417         /* cpu currently holding logbuf_lock in this function */
21418         static unsigned int logbuf_cpu = UINT_MAX;
21420 +       /*
21421 +        * Fall back to early_printk if a debugging subsystem has
21422 +        * killed printk output
21423 +        */
21424 +       if (unlikely(forced_early_printk(fmt, args)))
21425 +               return 1;
21427         if (level == LOGLEVEL_SCHED) {
21428                 level = LOGLEVEL_DEFAULT;
21429                 in_sched = true;
21430 @@ -1813,8 +1920,7 @@ asmlinkage int vprintk_emit(int facility, int level,
21431                  * console_sem which would prevent anyone from printing to
21432                  * console
21433                  */
21434 -               preempt_disable();
21436 +               migrate_disable();
21437                 /*
21438                  * Try to acquire and then immediately release the console
21439                  * semaphore.  The release will print out buffers and wake up
21440 @@ -1822,7 +1928,7 @@ asmlinkage int vprintk_emit(int facility, int level,
21441                  */
21442                 if (console_trylock_for_printk())
21443                         console_unlock();
21444 -               preempt_enable();
21445 +               migrate_enable();
21446                 lockdep_on();
21447         }
21449 @@ -1961,26 +2067,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
21451  #endif /* CONFIG_PRINTK */
21453 -#ifdef CONFIG_EARLY_PRINTK
21454 -struct console *early_console;
21456 -asmlinkage __visible void early_printk(const char *fmt, ...)
21458 -       va_list ap;
21459 -       char buf[512];
21460 -       int n;
21462 -       if (!early_console)
21463 -               return;
21465 -       va_start(ap, fmt);
21466 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
21467 -       va_end(ap);
21469 -       early_console->write(early_console, buf, n);
21471 -#endif
21473  static int __add_preferred_console(char *name, int idx, char *options,
21474                                    char *brl_options)
21476 @@ -2202,11 +2288,16 @@ static void console_cont_flush(char *text, size_t size)
21477                 goto out;
21479         len = cont_print_text(text, size);
21480 +#ifdef CONFIG_PREEMPT_RT_FULL
21481 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21482 +       call_console_drivers(cont.level, NULL, 0, text, len);
21483 +#else
21484         raw_spin_unlock(&logbuf_lock);
21485         stop_critical_timings();
21486         call_console_drivers(cont.level, NULL, 0, text, len);
21487         start_critical_timings();
21488         local_irq_restore(flags);
21489 +#endif
21490         return;
21491  out:
21492         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21493 @@ -2316,13 +2407,17 @@ skip:
21494                 console_idx = log_next(console_idx);
21495                 console_seq++;
21496                 console_prev = msg->flags;
21497 +#ifdef CONFIG_PREEMPT_RT_FULL
21498 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
21499 +               call_console_drivers(level, ext_text, ext_len, text, len);
21500 +#else
21501                 raw_spin_unlock(&logbuf_lock);
21503                 stop_critical_timings();        /* don't trace print latency */
21504                 call_console_drivers(level, ext_text, ext_len, text, len);
21505                 start_critical_timings();
21506                 local_irq_restore(flags);
21508 +#endif
21509                 if (do_cond_resched)
21510                         cond_resched();
21511         }
21512 @@ -2374,6 +2469,11 @@ void console_unblank(void)
21514         struct console *c;
21516 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
21517 +               if (in_irq() || in_nmi())
21518 +                       return;
21519 +       }
21521         /*
21522          * console_unblank can no longer be called in interrupt context unless
21523          * oops_in_progress is set to 1..
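The remaining printk.c hunks gate direct console output on PREEMPT_RT: call_console_drivers() and console_unblank() bail out from hard-irq or NMI context, console_trylock_for_printk() only takes the console when the caller is preemptible with interrupts enabled, and the flush paths run the drivers with interrupts on (only migration disabled), since console drivers may sleep on RT. A tiny sketch of that "do it now only if the context is safe, otherwise leave it for later" gate (hypothetical context flags, not the kernel predicates):

    #include <stdbool.h>
    #include <stdio.h>

    struct ctx { bool in_irq; bool irqs_disabled; bool preemptible; };

    /* Decide whether to push messages to the console right now. */
    static bool can_flush_now(const struct ctx *c)
    {
            if (c->in_irq)                    /* never from hard irq / NMI */
                    return false;
            return c->preemptible && !c->irqs_disabled;
    }

    int main(void)
    {
            struct ctx task_ctx = { .in_irq = false, .irqs_disabled = false,
                                    .preemptible = true };
            struct ctx irq_ctx  = { .in_irq = true,  .irqs_disabled = true,
                                    .preemptible = false };

            printf("task context: %s\n", can_flush_now(&task_ctx) ? "flush" : "defer");
            printf("irq context:  %s\n", can_flush_now(&irq_ctx)  ? "flush" : "defer");
            return 0;
    }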
21524 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
21525 index c7e8ed99c953..2a19cc5001b4 100644
21526 --- a/kernel/ptrace.c
21527 +++ b/kernel/ptrace.c
21528 @@ -136,7 +136,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
21530         spin_lock_irq(&task->sighand->siglock);
21531         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
21532 -               task->state = __TASK_TRACED;
21533 +               unsigned long flags;
21535 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
21536 +               if (task->state & __TASK_TRACED)
21537 +                       task->state = __TASK_TRACED;
21538 +               else
21539 +                       task->saved_state = __TASK_TRACED;
21540 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
21541                 ret = true;
21542         }
21543         spin_unlock_irq(&task->sighand->siglock);
21544 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
21545 index d89328e260df..5bb3364a6284 100644
21546 --- a/kernel/rcu/rcutorture.c
21547 +++ b/kernel/rcu/rcutorture.c
21548 @@ -390,6 +390,7 @@ static struct rcu_torture_ops rcu_ops = {
21549         .name           = "rcu"
21550  };
21552 +#ifndef CONFIG_PREEMPT_RT_FULL
21553  /*
21554   * Definitions for rcu_bh torture testing.
21555   */
21556 @@ -429,6 +430,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
21557         .name           = "rcu_bh"
21558  };
21560 +#else
21561 +static struct rcu_torture_ops rcu_bh_ops = {
21562 +       .ttype          = INVALID_RCU_FLAVOR,
21564 +#endif
21566  /*
21567   * Don't even think about trying any of these in real life!!!
21568   * The names includes "busted", and they really means it!
21569 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
21570 index f07343b54fe5..d862a203fce0 100644
21571 --- a/kernel/rcu/tree.c
21572 +++ b/kernel/rcu/tree.c
21573 @@ -56,6 +56,11 @@
21574  #include <linux/random.h>
21575  #include <linux/trace_events.h>
21576  #include <linux/suspend.h>
21577 +#include <linux/delay.h>
21578 +#include <linux/gfp.h>
21579 +#include <linux/oom.h>
21580 +#include <linux/smpboot.h>
21581 +#include "../time/tick-internal.h"
21583  #include "tree.h"
21584  #include "rcu.h"
21585 @@ -266,6 +271,19 @@ void rcu_sched_qs(void)
21586         }
21589 +#ifdef CONFIG_PREEMPT_RT_FULL
21590 +static void rcu_preempt_qs(void);
21592 +void rcu_bh_qs(void)
21594 +       unsigned long flags;
21596 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
21597 +       local_irq_save(flags);
21598 +       rcu_preempt_qs();
21599 +       local_irq_restore(flags);
21601 +#else
21602  void rcu_bh_qs(void)
21604         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
21605 @@ -275,6 +293,7 @@ void rcu_bh_qs(void)
21606                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
21607         }
21609 +#endif
21611  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
21613 @@ -435,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
21614  /*
21615   * Return the number of RCU BH batches started thus far for debug & stats.
21616   */
21617 +#ifndef CONFIG_PREEMPT_RT_FULL
21618  unsigned long rcu_batches_started_bh(void)
21620         return rcu_bh_state.gpnum;
21622  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
21623 +#endif
21625  /*
21626   * Return the number of RCU batches completed thus far for debug & stats.
21627 @@ -459,6 +480,7 @@ unsigned long rcu_batches_completed_sched(void)
21629  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
21631 +#ifndef CONFIG_PREEMPT_RT_FULL
21632  /*
21633   * Return the number of RCU BH batches completed thus far for debug & stats.
21634   */
21635 @@ -486,6 +508,13 @@ void rcu_bh_force_quiescent_state(void)
21637  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
21639 +#else
21640 +void rcu_force_quiescent_state(void)
21643 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
21644 +#endif
21646  /*
21647   * Force a quiescent state for RCU-sched.
21648   */
21649 @@ -536,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
21650         case RCU_FLAVOR:
21651                 rsp = rcu_state_p;
21652                 break;
21653 +#ifndef CONFIG_PREEMPT_RT_FULL
21654         case RCU_BH_FLAVOR:
21655                 rsp = &rcu_bh_state;
21656                 break;
21657 +#endif
21658         case RCU_SCHED_FLAVOR:
21659                 rsp = &rcu_sched_state;
21660                 break;
21661 @@ -1590,7 +1621,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
21662         int needmore;
21663         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
21665 -       rcu_nocb_gp_cleanup(rsp, rnp);
21666         rnp->need_future_gp[c & 0x1] = 0;
21667         needmore = rnp->need_future_gp[(c + 1) & 0x1];
21668         trace_rcu_future_gp(rnp, rdp, c,
21669 @@ -1611,7 +1641,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
21670             !READ_ONCE(rsp->gp_flags) ||
21671             !rsp->gp_kthread)
21672                 return;
21673 -       wake_up(&rsp->gp_wq);
21674 +       swake_up(&rsp->gp_wq);
21677  /*
21678 @@ -1991,6 +2021,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
21679         int nocb = 0;
21680         struct rcu_data *rdp;
21681         struct rcu_node *rnp = rcu_get_root(rsp);
21682 +       struct swait_queue_head *sq;
21684         WRITE_ONCE(rsp->gp_activity, jiffies);
21685         raw_spin_lock_irq(&rnp->lock);
21686 @@ -2029,7 +2060,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
21687                         needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
21688                 /* smp_mb() provided by prior unlock-lock pair. */
21689                 nocb += rcu_future_gp_cleanup(rsp, rnp);
21690 +               sq = rcu_nocb_gp_get(rnp);
21691                 raw_spin_unlock_irq(&rnp->lock);
21692 +               rcu_nocb_gp_cleanup(sq);
21693                 cond_resched_rcu_qs();
21694                 WRITE_ONCE(rsp->gp_activity, jiffies);
21695                 rcu_gp_slow(rsp, gp_cleanup_delay);
21696 @@ -2076,7 +2109,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21697                                                READ_ONCE(rsp->gpnum),
21698                                                TPS("reqwait"));
21699                         rsp->gp_state = RCU_GP_WAIT_GPS;
21700 -                       wait_event_interruptible(rsp->gp_wq,
21701 +                       swait_event_interruptible(rsp->gp_wq,
21702                                                  READ_ONCE(rsp->gp_flags) &
21703                                                  RCU_GP_FLAG_INIT);
21704                         rsp->gp_state = RCU_GP_DONE_GPS;
21705 @@ -2106,7 +2139,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
21706                                                READ_ONCE(rsp->gpnum),
21707                                                TPS("fqswait"));
21708                         rsp->gp_state = RCU_GP_WAIT_FQS;
21709 -                       ret = wait_event_interruptible_timeout(rsp->gp_wq,
21710 +                       ret = swait_event_interruptible_timeout(rsp->gp_wq,
21711                                         rcu_gp_fqs_check_wake(rsp, &gf), j);
21712                         rsp->gp_state = RCU_GP_DOING_FQS;
21713                         /* Locking provides needed memory barriers. */
21714 @@ -2230,7 +2263,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
21715         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
21716         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21717         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
21718 -       rcu_gp_kthread_wake(rsp);
21719 +       swake_up(&rsp->gp_wq);  /* Memory barrier implied by swake_up() path. */
21722  /*
21723 @@ -2891,7 +2924,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
21724         }
21725         WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
21726         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
21727 -       rcu_gp_kthread_wake(rsp);
21728 +       swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
21731  /*
21732 @@ -2934,18 +2967,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
21733  /*
21734   * Do RCU core processing for the current CPU.
21735   */
21736 -static void rcu_process_callbacks(struct softirq_action *unused)
21737 +static void rcu_process_callbacks(void)
21739         struct rcu_state *rsp;
21741         if (cpu_is_offline(smp_processor_id()))
21742                 return;
21743 -       trace_rcu_utilization(TPS("Start RCU core"));
21744         for_each_rcu_flavor(rsp)
21745                 __rcu_process_callbacks(rsp);
21746 -       trace_rcu_utilization(TPS("End RCU core"));
21749 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
21750  /*
21751   * Schedule RCU callback invocation.  If the specified type of RCU
21752   * does not support RCU priority boosting, just do a direct call,
21753 @@ -2957,18 +2989,105 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
21755         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
21756                 return;
21757 -       if (likely(!rsp->boost)) {
21758 -               rcu_do_batch(rsp, rdp);
21759 -               return;
21760 -       }
21761 -       invoke_rcu_callbacks_kthread();
21762 +       rcu_do_batch(rsp, rdp);
21765 +static void rcu_wake_cond(struct task_struct *t, int status)
21767 +       /*
21768 +        * If the thread is yielding, only wake it when this
21769 +        * is invoked from idle
21770 +        */
21771 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
21772 +               wake_up_process(t);
21776 + * Wake up this CPU's rcuc kthread to do RCU core processing.
21777 + */
21778  static void invoke_rcu_core(void)
21780 -       if (cpu_online(smp_processor_id()))
21781 -               raise_softirq(RCU_SOFTIRQ);
21782 +       unsigned long flags;
21783 +       struct task_struct *t;
21785 +       if (!cpu_online(smp_processor_id()))
21786 +               return;
21787 +       local_irq_save(flags);
21788 +       __this_cpu_write(rcu_cpu_has_work, 1);
21789 +       t = __this_cpu_read(rcu_cpu_kthread_task);
21790 +       if (t != NULL && current != t)
21791 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
21792 +       local_irq_restore(flags);
21795 +static void rcu_cpu_kthread_park(unsigned int cpu)
21797 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
21800 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
21802 +       return __this_cpu_read(rcu_cpu_has_work);
21806 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
21807 + * RCU softirq used in flavors and configurations of RCU that do not
21808 + * support RCU priority boosting.
21809 + */
21810 +static void rcu_cpu_kthread(unsigned int cpu)
21812 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
21813 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
21814 +       int spincnt;
21816 +       for (spincnt = 0; spincnt < 10; spincnt++) {
21817 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
21818 +               local_bh_disable();
21819 +               *statusp = RCU_KTHREAD_RUNNING;
21820 +               this_cpu_inc(rcu_cpu_kthread_loops);
21821 +               local_irq_disable();
21822 +               work = *workp;
21823 +               *workp = 0;
21824 +               local_irq_enable();
21825 +               if (work)
21826 +                       rcu_process_callbacks();
21827 +               local_bh_enable();
21828 +               if (*workp == 0) {
21829 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
21830 +                       *statusp = RCU_KTHREAD_WAITING;
21831 +                       return;
21832 +               }
21833 +       }
21834 +       *statusp = RCU_KTHREAD_YIELDING;
21835 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
21836 +       schedule_timeout_interruptible(2);
21837 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
21838 +       *statusp = RCU_KTHREAD_WAITING;
21841 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
21842 +       .store                  = &rcu_cpu_kthread_task,
21843 +       .thread_should_run      = rcu_cpu_kthread_should_run,
21844 +       .thread_fn              = rcu_cpu_kthread,
21845 +       .thread_comm            = "rcuc/%u",
21846 +       .setup                  = rcu_cpu_kthread_setup,
21847 +       .park                   = rcu_cpu_kthread_park,
21851 + * Spawn per-CPU RCU core processing kthreads.
21852 + */
21853 +static int __init rcu_spawn_core_kthreads(void)
21855 +       int cpu;
21857 +       for_each_possible_cpu(cpu)
21858 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
21859 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
21860 +       return 0;
21862 +early_initcall(rcu_spawn_core_kthreads);
21864  /*
21865   * Handle any core-RCU processing required by a call_rcu() invocation.
21866 @@ -3114,6 +3233,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
21868  EXPORT_SYMBOL_GPL(call_rcu_sched);
21870 +#ifndef CONFIG_PREEMPT_RT_FULL
21871  /*
21872   * Queue an RCU callback for invocation after a quicker grace period.
21873   */
21874 @@ -3122,6 +3242,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
21875         __call_rcu(head, func, &rcu_bh_state, -1, 0);
21877  EXPORT_SYMBOL_GPL(call_rcu_bh);
21878 +#endif
21880  /*
21881   * Queue an RCU callback for lazy invocation after a grace period.
21882 @@ -3213,6 +3334,7 @@ void synchronize_sched(void)
21884  EXPORT_SYMBOL_GPL(synchronize_sched);
21886 +#ifndef CONFIG_PREEMPT_RT_FULL
21887  /**
21888   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
21889   *
21890 @@ -3239,6 +3361,7 @@ void synchronize_rcu_bh(void)
21891                 wait_rcu_gp(call_rcu_bh);
21893  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
21894 +#endif
21896  /**
21897   * get_state_synchronize_rcu - Snapshot current RCU state
21898 @@ -3524,7 +3647,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
21899                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
21900                         if (wake) {
21901                                 smp_mb(); /* EGP done before wake_up(). */
21902 -                               wake_up(&rsp->expedited_wq);
21903 +                               swake_up(&rsp->expedited_wq);
21904                         }
21905                         break;
21906                 }
21907 @@ -3781,7 +3904,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21908         jiffies_start = jiffies;
21910         for (;;) {
21911 -               ret = wait_event_interruptible_timeout(
21912 +               ret = swait_event_timeout(
21913                                 rsp->expedited_wq,
21914                                 sync_rcu_preempt_exp_done(rnp_root),
21915                                 jiffies_stall);
21916 @@ -3789,7 +3912,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
21917                         return;
21918                 if (ret < 0) {
21919                         /* Hit a signal, disable CPU stall warnings. */
21920 -                       wait_event(rsp->expedited_wq,
21921 +                       swait_event(rsp->expedited_wq,
21922                                    sync_rcu_preempt_exp_done(rnp_root));
21923                         return;
21924                 }
21925 @@ -4101,6 +4224,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
21926         mutex_unlock(&rsp->barrier_mutex);
21929 +#ifndef CONFIG_PREEMPT_RT_FULL
21930  /**
21931   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
21932   */
21933 @@ -4109,6 +4233,7 @@ void rcu_barrier_bh(void)
21934         _rcu_barrier(&rcu_bh_state);
21936  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
21937 +#endif
21939  /**
21940   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
21941 @@ -4455,8 +4580,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
21942                 }
21943         }
21945 -       init_waitqueue_head(&rsp->gp_wq);
21946 -       init_waitqueue_head(&rsp->expedited_wq);
21947 +       init_swait_queue_head(&rsp->gp_wq);
21948 +       init_swait_queue_head(&rsp->expedited_wq);
21949         rnp = rsp->level[rcu_num_lvls - 1];
21950         for_each_possible_cpu(i) {
21951                 while (i > rnp->grphi)
21952 @@ -4576,12 +4701,13 @@ void __init rcu_init(void)
21954         rcu_bootup_announce();
21955         rcu_init_geometry();
21956 +#ifndef CONFIG_PREEMPT_RT_FULL
21957         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
21958 +#endif
21959         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
21960         if (dump_tree)
21961                 rcu_dump_rcu_node_tree(&rcu_sched_state);
21962         __rcu_init_preempt();
21963 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
21965         /*
21966          * We don't need protection against CPU-hotplug here because
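
Taken together, the kernel/rcu/tree.c hunks above do two things: they route RCU core processing away from the RCU_SOFTIRQ handler and into per-CPU "rcuc/%u" kthreads, and they convert the grace-period and expedited wait queues from wait_event()/wake_up() to the simple-waitqueue (swait) variants; under PREEMPT_RT_FULL the RCU-bh flavor is additionally folded into the preemptible flavor. The fragment below is a plain userspace analog of the rcu_cpu_kthread() work loop only: poll a "has work" flag, process it, and yield after a bounded number of busy passes. It is not kernel code; apart from the pass limit of 10, every name in it is invented for the illustration.

/*
 * Userspace analog of the rcu_cpu_kthread() loop: bounded busy passes
 * over a per-thread work flag, then yield.  Illustration only.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int has_work = 1;

static void process_callbacks(void)
{
	printf("processing callbacks\n");
}

static void rcuc_loop(void)
{
	for (int spincnt = 0; spincnt < 10; spincnt++) {
		/* Grab the pending work atomically, as the kernel does with irqs off. */
		int work = atomic_exchange(&has_work, 0);

		if (work)
			process_callbacks();
		if (!atomic_load(&has_work))
			return;		/* no new work arrived: go back to waiting */
	}
	sched_yield();			/* the kernel loop sleeps ~2 jiffies instead */
}

int main(void)
{
	rcuc_loop();
	return 0;
}
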
21967 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
21968 index 9fb4e238d4dc..c75834d8de24 100644
21969 --- a/kernel/rcu/tree.h
21970 +++ b/kernel/rcu/tree.h
21971 @@ -27,6 +27,7 @@
21972  #include <linux/threads.h>
21973  #include <linux/cpumask.h>
21974  #include <linux/seqlock.h>
21975 +#include <linux/swait.h>
21976  #include <linux/stop_machine.h>
21978  /*
21979 @@ -241,7 +242,7 @@ struct rcu_node {
21980                                 /* Refused to boost: not sure why, though. */
21981                                 /*  This can happen due to race conditions. */
21982  #ifdef CONFIG_RCU_NOCB_CPU
21983 -       wait_queue_head_t nocb_gp_wq[2];
21984 +       struct swait_queue_head nocb_gp_wq[2];
21985                                 /* Place for rcu_nocb_kthread() to wait GP. */
21986  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
21987         int need_future_gp[2];
21988 @@ -393,7 +394,7 @@ struct rcu_data {
21989         atomic_long_t nocb_q_count_lazy; /*  invocation (all stages). */
21990         struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
21991         struct rcu_head **nocb_follower_tail;
21992 -       wait_queue_head_t nocb_wq;      /* For nocb kthreads to sleep on. */
21993 +       struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
21994         struct task_struct *nocb_kthread;
21995         int nocb_defer_wakeup;          /* Defer wakeup of nocb_kthread. */
21997 @@ -472,7 +473,7 @@ struct rcu_state {
21998         unsigned long gpnum;                    /* Current gp number. */
21999         unsigned long completed;                /* # of last completed gp. */
22000         struct task_struct *gp_kthread;         /* Task for grace periods. */
22001 -       wait_queue_head_t gp_wq;                /* Where GP task waits. */
22002 +       struct swait_queue_head gp_wq;          /* Where GP task waits. */
22003         short gp_flags;                         /* Commands for GP task. */
22004         short gp_state;                         /* GP kthread sleep state. */
22006 @@ -504,7 +505,7 @@ struct rcu_state {
22007         atomic_long_t expedited_workdone3;      /* # done by others #3. */
22008         atomic_long_t expedited_normal;         /* # fallbacks to normal. */
22009         atomic_t expedited_need_qs;             /* # CPUs left to check in. */
22010 -       wait_queue_head_t expedited_wq;         /* Wait for check-ins. */
22011 +       struct swait_queue_head expedited_wq;   /* Wait for check-ins. */
22012         int ncpus_snap;                         /* # CPUs seen last time. */
22014         unsigned long jiffies_force_qs;         /* Time at which to invoke */
22015 @@ -556,18 +557,18 @@ extern struct list_head rcu_struct_flavors;
22016   */
22017  extern struct rcu_state rcu_sched_state;
22019 +#ifndef CONFIG_PREEMPT_RT_FULL
22020  extern struct rcu_state rcu_bh_state;
22021 +#endif
22023  #ifdef CONFIG_PREEMPT_RCU
22024  extern struct rcu_state rcu_preempt_state;
22025  #endif /* #ifdef CONFIG_PREEMPT_RCU */
22027 -#ifdef CONFIG_RCU_BOOST
22028  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22029  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
22030  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22031  DECLARE_PER_CPU(char, rcu_cpu_has_work);
22032 -#endif /* #ifdef CONFIG_RCU_BOOST */
22034  #ifndef RCU_TREE_NONCORE
22036 @@ -587,10 +588,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
22037  static void __init __rcu_init_preempt(void);
22038  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
22039  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
22040 -static void invoke_rcu_callbacks_kthread(void);
22041  static bool rcu_is_callbacks_kthread(void);
22042 +static void rcu_cpu_kthread_setup(unsigned int cpu);
22043  #ifdef CONFIG_RCU_BOOST
22044 -static void rcu_preempt_do_callbacks(void);
22045  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
22046                                                  struct rcu_node *rnp);
22047  #endif /* #ifdef CONFIG_RCU_BOOST */
22048 @@ -607,7 +607,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
22049  static void increment_cpu_stall_ticks(void);
22050  static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
22051  static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
22052 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
22053 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
22054 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
22055  static void rcu_init_one_nocb(struct rcu_node *rnp);
22056  static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
22057                             bool lazy, unsigned long flags);
22058 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
22059 index 32cbe72bf545..45e3e3e02a5c 100644
22060 --- a/kernel/rcu/tree_plugin.h
22061 +++ b/kernel/rcu/tree_plugin.h
22062 @@ -24,25 +24,10 @@
22063   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
22064   */
22066 -#include <linux/delay.h>
22067 -#include <linux/gfp.h>
22068 -#include <linux/oom.h>
22069 -#include <linux/smpboot.h>
22070 -#include "../time/tick-internal.h"
22072  #ifdef CONFIG_RCU_BOOST
22074  #include "../locking/rtmutex_common.h"
22077 - * Control variables for per-CPU and per-rcu_node kthreads.  These
22078 - * handle all flavors of RCU.
22079 - */
22080 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
22081 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22082 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22083 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
22085  #else /* #ifdef CONFIG_RCU_BOOST */
22087  /*
22088 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
22090  #endif /* #else #ifdef CONFIG_RCU_BOOST */
22093 + * Control variables for per-CPU and per-rcu_node kthreads.  These
22094 + * handle all flavors of RCU.
22095 + */
22096 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
22097 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
22098 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
22100  #ifdef CONFIG_RCU_NOCB_CPU
22101  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
22102  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
22103 @@ -432,7 +425,7 @@ void rcu_read_unlock_special(struct task_struct *t)
22104         }
22106         /* Hardware IRQ handlers cannot block, complain if they get here. */
22107 -       if (in_irq() || in_serving_softirq()) {
22108 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
22109                 lockdep_rcu_suspicious(__FILE__, __LINE__,
22110                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
22111                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
22112 @@ -645,15 +638,6 @@ static void rcu_preempt_check_callbacks(void)
22113                 t->rcu_read_unlock_special.b.need_qs = true;
22116 -#ifdef CONFIG_RCU_BOOST
22118 -static void rcu_preempt_do_callbacks(void)
22120 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
22123 -#endif /* #ifdef CONFIG_RCU_BOOST */
22125  /*
22126   * Queue a preemptible-RCU callback for invocation after a grace period.
22127   */
22128 @@ -930,6 +914,19 @@ void exit_rcu(void)
22130  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
22133 + * If boosting, set rcuc kthreads to realtime priority.
22134 + */
22135 +static void rcu_cpu_kthread_setup(unsigned int cpu)
22137 +#ifdef CONFIG_RCU_BOOST
22138 +       struct sched_param sp;
22140 +       sp.sched_priority = kthread_prio;
22141 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
22142 +#endif /* #ifdef CONFIG_RCU_BOOST */
22145  #ifdef CONFIG_RCU_BOOST
22147  #include "../locking/rtmutex_common.h"
22148 @@ -961,16 +958,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
22150  #endif /* #else #ifdef CONFIG_RCU_TRACE */
22152 -static void rcu_wake_cond(struct task_struct *t, int status)
22154 -       /*
22155 -        * If the thread is yielding, only wake it when this
22156 -        * is invoked from idle
22157 -        */
22158 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
22159 -               wake_up_process(t);
22162  /*
22163   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
22164   * or ->boost_tasks, advancing the pointer to the next task in the
22165 @@ -1115,23 +1102,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
22168  /*
22169 - * Wake up the per-CPU kthread to invoke RCU callbacks.
22170 - */
22171 -static void invoke_rcu_callbacks_kthread(void)
22173 -       unsigned long flags;
22175 -       local_irq_save(flags);
22176 -       __this_cpu_write(rcu_cpu_has_work, 1);
22177 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
22178 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
22179 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
22180 -                             __this_cpu_read(rcu_cpu_kthread_status));
22181 -       }
22182 -       local_irq_restore(flags);
22186   * Is the current CPU running the RCU-callbacks kthread?
22187   * Caller must have preemption disabled.
22188   */
22189 @@ -1186,67 +1156,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
22190         return 0;
22193 -static void rcu_kthread_do_work(void)
22195 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
22196 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
22197 -       rcu_preempt_do_callbacks();
22200 -static void rcu_cpu_kthread_setup(unsigned int cpu)
22202 -       struct sched_param sp;
22204 -       sp.sched_priority = kthread_prio;
22205 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
22208 -static void rcu_cpu_kthread_park(unsigned int cpu)
22210 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
22213 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
22215 -       return __this_cpu_read(rcu_cpu_has_work);
22219 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
22220 - * RCU softirq used in flavors and configurations of RCU that do not
22221 - * support RCU priority boosting.
22222 - */
22223 -static void rcu_cpu_kthread(unsigned int cpu)
22225 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
22226 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
22227 -       int spincnt;
22229 -       for (spincnt = 0; spincnt < 10; spincnt++) {
22230 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
22231 -               local_bh_disable();
22232 -               *statusp = RCU_KTHREAD_RUNNING;
22233 -               this_cpu_inc(rcu_cpu_kthread_loops);
22234 -               local_irq_disable();
22235 -               work = *workp;
22236 -               *workp = 0;
22237 -               local_irq_enable();
22238 -               if (work)
22239 -                       rcu_kthread_do_work();
22240 -               local_bh_enable();
22241 -               if (*workp == 0) {
22242 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
22243 -                       *statusp = RCU_KTHREAD_WAITING;
22244 -                       return;
22245 -               }
22246 -       }
22247 -       *statusp = RCU_KTHREAD_YIELDING;
22248 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
22249 -       schedule_timeout_interruptible(2);
22250 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
22251 -       *statusp = RCU_KTHREAD_WAITING;
22254  /*
22255   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
22256   * served by the rcu_node in question.  The CPU hotplug lock is still
22257 @@ -1276,26 +1185,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
22258         free_cpumask_var(cm);
22261 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
22262 -       .store                  = &rcu_cpu_kthread_task,
22263 -       .thread_should_run      = rcu_cpu_kthread_should_run,
22264 -       .thread_fn              = rcu_cpu_kthread,
22265 -       .thread_comm            = "rcuc/%u",
22266 -       .setup                  = rcu_cpu_kthread_setup,
22267 -       .park                   = rcu_cpu_kthread_park,
22270  /*
22271   * Spawn boost kthreads -- called as soon as the scheduler is running.
22272   */
22273  static void __init rcu_spawn_boost_kthreads(void)
22275         struct rcu_node *rnp;
22276 -       int cpu;
22278 -       for_each_possible_cpu(cpu)
22279 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
22280 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
22281         rcu_for_each_leaf_node(rcu_state_p, rnp)
22282                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
22284 @@ -1318,11 +1213,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
22285         raw_spin_unlock_irqrestore(&rnp->lock, flags);
22288 -static void invoke_rcu_callbacks_kthread(void)
22290 -       WARN_ON_ONCE(1);
22293  static bool rcu_is_callbacks_kthread(void)
22295         return false;
22296 @@ -1346,7 +1236,7 @@ static void rcu_prepare_kthreads(int cpu)
22298  #endif /* #else #ifdef CONFIG_RCU_BOOST */
22300 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
22301 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
22303  /*
22304   * Check to see if any future RCU-related work will need to be done
22305 @@ -1363,7 +1253,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
22306         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
22307                ? 0 : rcu_cpu_has_callbacks(NULL);
22309 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
22311 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
22312  /*
22313   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
22314   * after it.
22315 @@ -1459,6 +1351,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
22316         return cbs_ready;
22319 +#ifndef CONFIG_PREEMPT_RT_FULL
22321  /*
22322   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
22323   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
22324 @@ -1504,6 +1398,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
22325         *nextevt = basemono + dj * TICK_NSEC;
22326         return 0;
22328 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
22330  /*
22331   * Prepare a CPU for idle from an RCU perspective.  The first major task
22332 @@ -1822,9 +1717,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
22333   * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
22334   * grace period.
22335   */
22336 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
22337 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
22339 -       wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
22340 +       swake_up_all(sq);
22343  /*
22344 @@ -1840,10 +1735,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
22345         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
22348 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
22350 +       return &rnp->nocb_gp_wq[rnp->completed & 0x1];
22353  static void rcu_init_one_nocb(struct rcu_node *rnp)
22355 -       init_waitqueue_head(&rnp->nocb_gp_wq[0]);
22356 -       init_waitqueue_head(&rnp->nocb_gp_wq[1]);
22357 +       init_swait_queue_head(&rnp->nocb_gp_wq[0]);
22358 +       init_swait_queue_head(&rnp->nocb_gp_wq[1]);
22361  #ifndef CONFIG_RCU_NOCB_CPU_ALL
22362 @@ -1868,7 +1768,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
22363         if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
22364                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
22365                 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
22366 -               wake_up(&rdp_leader->nocb_wq);
22367 +               swake_up(&rdp_leader->nocb_wq);
22368         }
22371 @@ -2081,7 +1981,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
22372          */
22373         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
22374         for (;;) {
22375 -               wait_event_interruptible(
22376 +               swait_event_interruptible(
22377                         rnp->nocb_gp_wq[c & 0x1],
22378                         (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
22379                 if (likely(d))
22380 @@ -2109,7 +2009,7 @@ wait_again:
22381         /* Wait for callbacks to appear. */
22382         if (!rcu_nocb_poll) {
22383                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
22384 -               wait_event_interruptible(my_rdp->nocb_wq,
22385 +               swait_event_interruptible(my_rdp->nocb_wq,
22386                                 !READ_ONCE(my_rdp->nocb_leader_sleep));
22387                 /* Memory barrier handled by smp_mb() calls below and repoll. */
22388         } else if (firsttime) {
22389 @@ -2184,7 +2084,7 @@ wait_again:
22390                          * List was empty, wake up the follower.
22391                          * Memory barriers supplied by atomic_long_add().
22392                          */
22393 -                       wake_up(&rdp->nocb_wq);
22394 +                       swake_up(&rdp->nocb_wq);
22395                 }
22396         }
22398 @@ -2205,7 +2105,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
22399                 if (!rcu_nocb_poll) {
22400                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
22401                                             "FollowerSleep");
22402 -                       wait_event_interruptible(rdp->nocb_wq,
22403 +                       swait_event_interruptible(rdp->nocb_wq,
22404                                                  READ_ONCE(rdp->nocb_follower_head));
22405                 } else if (firsttime) {
22406                         /* Don't drown trace log with "Poll"! */
22407 @@ -2365,7 +2265,7 @@ void __init rcu_init_nohz(void)
22408  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
22410         rdp->nocb_tail = &rdp->nocb_head;
22411 -       init_waitqueue_head(&rdp->nocb_wq);
22412 +       init_swait_queue_head(&rdp->nocb_wq);
22413         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
22416 @@ -2515,7 +2415,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
22417         return false;
22420 -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
22421 +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
22425 @@ -2523,6 +2423,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
22429 +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
22431 +       return NULL;
22434  static void rcu_init_one_nocb(struct rcu_node *rnp)
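
One detail worth noting in the tree.c and tree_plugin.h hunks is the rcu_nocb_gp_cleanup() split: the wait queue is looked up via the new rcu_nocb_gp_get() while the rcu_node lock is still held, but the wakeup itself is issued only after the lock is dropped, which matters once that lock is raw on RT. The fragment below sketches that ordering with pthread primitives standing in for the kernel's raw spinlock and swait queue; every name in it is invented for the illustration and it is not the kernel API.

/*
 * Sketch of "look up the queue under the lock, wake after unlocking".
 * pthread objects stand in for the kernel raw spinlock and swait queue.
 */
#include <pthread.h>

struct gp_node {
	pthread_mutex_t lock;
	pthread_cond_t wq[2];
	unsigned long completed;
};

static void gp_cleanup(struct gp_node *np)
{
	pthread_cond_t *sq;

	pthread_mutex_lock(&np->lock);
	np->completed++;
	sq = &np->wq[np->completed & 0x1];	/* rcu_nocb_gp_get() analog */
	pthread_mutex_unlock(&np->lock);
	pthread_cond_broadcast(sq);		/* rcu_nocb_gp_cleanup() analog */
}

int main(void)
{
	struct gp_node np = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wq = { PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER },
	};

	gp_cleanup(&np);	/* no waiters here; the point is the ordering */
	return 0;
}
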
22437 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
22438 index 5f748c5a40f0..9a3904603ff6 100644
22439 --- a/kernel/rcu/update.c
22440 +++ b/kernel/rcu/update.c
22441 @@ -276,6 +276,7 @@ int rcu_read_lock_held(void)
22443  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
22445 +#ifndef CONFIG_PREEMPT_RT_FULL
22446  /**
22447   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
22448   *
22449 @@ -302,6 +303,7 @@ int rcu_read_lock_bh_held(void)
22450         return in_softirq() || irqs_disabled();
22452  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
22453 +#endif
22455  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
22457 diff --git a/kernel/relay.c b/kernel/relay.c
22458 index 0b4570cfacae..60684be39f22 100644
22459 --- a/kernel/relay.c
22460 +++ b/kernel/relay.c
22461 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
22463         struct rchan_buf *buf = (struct rchan_buf *)data;
22464         wake_up_interruptible(&buf->read_wait);
22465 +       /*
22466 +        * Stupid polling for now:
22467 +        */
22468 +       mod_timer(&buf->timer, jiffies + 1);
22471  /**
22472 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
22473                 init_waitqueue_head(&buf->read_wait);
22474                 kref_init(&buf->kref);
22475                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
22476 +               mod_timer(&buf->timer, jiffies + 1);
22477         } else
22478                 del_timer_sync(&buf->timer);
22480 @@ -736,15 +741,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
22481                 else
22482                         buf->early_bytes += buf->chan->subbuf_size -
22483                                             buf->padding[old_subbuf];
22484 -               smp_mb();
22485 -               if (waitqueue_active(&buf->read_wait))
22486 -                       /*
22487 -                        * Calling wake_up_interruptible() from here
22488 -                        * will deadlock if we happen to be logging
22489 -                        * from the scheduler (trying to re-grab
22490 -                        * rq->lock), so defer it.
22491 -                        */
22492 -                       mod_timer(&buf->timer, jiffies + 1);
22493         }
22495         old = buf->data;
22496 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
22497 index 67687973ce80..01b9994b367a 100644
22498 --- a/kernel/sched/Makefile
22499 +++ b/kernel/sched/Makefile
22500 @@ -13,7 +13,7 @@ endif
22502  obj-y += core.o loadavg.o clock.o cputime.o
22503  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
22504 -obj-y += wait.o completion.o idle.o
22505 +obj-y += wait.o swait.o swork.o completion.o idle.o
22506  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
22507  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
22508  obj-$(CONFIG_SCHEDSTATS) += stats.o
22509 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
22510 index 8d0f35debf35..b62cf6400fe0 100644
22511 --- a/kernel/sched/completion.c
22512 +++ b/kernel/sched/completion.c
22513 @@ -30,10 +30,10 @@ void complete(struct completion *x)
22515         unsigned long flags;
22517 -       spin_lock_irqsave(&x->wait.lock, flags);
22518 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22519         x->done++;
22520 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
22521 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22522 +       swake_up_locked(&x->wait);
22523 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22525  EXPORT_SYMBOL(complete);
22527 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
22529         unsigned long flags;
22531 -       spin_lock_irqsave(&x->wait.lock, flags);
22532 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22533         x->done += UINT_MAX/2;
22534 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
22535 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22536 +       swake_up_all_locked(&x->wait);
22537 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22539  EXPORT_SYMBOL(complete_all);
22541 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
22542                    long (*action)(long), long timeout, int state)
22544         if (!x->done) {
22545 -               DECLARE_WAITQUEUE(wait, current);
22546 +               DECLARE_SWAITQUEUE(wait);
22548 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
22549 +               __prepare_to_swait(&x->wait, &wait);
22550                 do {
22551                         if (signal_pending_state(state, current)) {
22552                                 timeout = -ERESTARTSYS;
22553                                 break;
22554                         }
22555                         __set_current_state(state);
22556 -                       spin_unlock_irq(&x->wait.lock);
22557 +                       raw_spin_unlock_irq(&x->wait.lock);
22558                         timeout = action(timeout);
22559 -                       spin_lock_irq(&x->wait.lock);
22560 +                       raw_spin_lock_irq(&x->wait.lock);
22561                 } while (!x->done && timeout);
22562 -               __remove_wait_queue(&x->wait, &wait);
22563 +               __finish_swait(&x->wait, &wait);
22564                 if (!x->done)
22565                         return timeout;
22566         }
22567 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
22569         might_sleep();
22571 -       spin_lock_irq(&x->wait.lock);
22572 +       raw_spin_lock_irq(&x->wait.lock);
22573         timeout = do_wait_for_common(x, action, timeout, state);
22574 -       spin_unlock_irq(&x->wait.lock);
22575 +       raw_spin_unlock_irq(&x->wait.lock);
22576         return timeout;
22579 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
22580         if (!READ_ONCE(x->done))
22581                 return 0;
22583 -       spin_lock_irqsave(&x->wait.lock, flags);
22584 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
22585         if (!x->done)
22586                 ret = 0;
22587         else
22588                 x->done--;
22589 -       spin_unlock_irqrestore(&x->wait.lock, flags);
22590 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
22591         return ret;
22593  EXPORT_SYMBOL(try_wait_for_completion);
22594 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
22595          * after it's acquired the lock.
22596          */
22597         smp_rmb();
22598 -       spin_unlock_wait(&x->wait.lock);
22599 +       raw_spin_unlock_wait(&x->wait.lock);
22600         return true;
22602  EXPORT_SYMBOL(completion_done);
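
The completion.c hunks convert the completion's wait queue to the swait flavour, whose lock is a raw spinlock, so complete() stays usable from contexts that must not take sleeping locks on RT; the done-counter semantics are unchanged. The fragment below is a userspace analog of those semantics only (a counter plus one blocking wait) built on a pthread mutex and condition variable; it is not the kernel implementation and all names are local to the example.

/*
 * Userspace analog of the completion semantics: complete() bumps the
 * done counter and wakes one waiter, wait_for_completion() blocks
 * until the counter is non-zero and then consumes one unit.
 */
#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	unsigned int done;
};

static void complete(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	x->done++;
	pthread_cond_signal(&x->wait);		/* swake_up_locked() analog */
	pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	while (!x->done)
		pthread_cond_wait(&x->wait, &x->lock);
	x->done--;
	pthread_mutex_unlock(&x->lock);
}

static void *worker(void *arg)
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion done = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	pthread_create(&t, NULL, worker, &done);
	wait_for_completion(&done);
	pthread_join(t, NULL);
	printf("completed\n");
	return 0;
}
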
22603 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
22604 index 20253dbc8610..e9b8d518202e 100644
22605 --- a/kernel/sched/core.c
22606 +++ b/kernel/sched/core.c
22607 @@ -260,7 +260,11 @@ late_initcall(sched_init_debug);
22608   * Number of tasks to iterate in a single balance run.
22609   * Limited because this is done with IRQs disabled.
22610   */
22611 +#ifndef CONFIG_PREEMPT_RT_FULL
22612  const_debug unsigned int sysctl_sched_nr_migrate = 32;
22613 +#else
22614 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
22615 +#endif
22617  /*
22618   * period over which we average the RT time consumption, measured
22619 @@ -438,6 +442,7 @@ static void init_rq_hrtick(struct rq *rq)
22621         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22622         rq->hrtick_timer.function = hrtick;
22623 +       rq->hrtick_timer.irqsafe = 1;
22625  #else  /* CONFIG_SCHED_HRTICK */
22626  static inline void hrtick_clear(struct rq *rq)
22627 @@ -542,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
22628         head->lastp = &node->next;
22631 -void wake_up_q(struct wake_q_head *head)
22632 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
22634         struct wake_q_node *node = head->first;
22636 @@ -559,7 +564,10 @@ void wake_up_q(struct wake_q_head *head)
22637                  * wake_up_process() implies a wmb() to pair with the queueing
22638                  * in wake_q_add() so as not to miss wakeups.
22639                  */
22640 -               wake_up_process(task);
22641 +               if (sleeper)
22642 +                       wake_up_lock_sleeper(task);
22643 +               else
22644 +                       wake_up_process(task);
22645                 put_task_struct(task);
22646         }
22648 @@ -595,6 +603,38 @@ void resched_curr(struct rq *rq)
22649                 trace_sched_wake_idle_without_ipi(cpu);
22652 +#ifdef CONFIG_PREEMPT_LAZY
22653 +void resched_curr_lazy(struct rq *rq)
22655 +       struct task_struct *curr = rq->curr;
22656 +       int cpu;
22658 +       if (!sched_feat(PREEMPT_LAZY)) {
22659 +               resched_curr(rq);
22660 +               return;
22661 +       }
22663 +       lockdep_assert_held(&rq->lock);
22665 +       if (test_tsk_need_resched(curr))
22666 +               return;
22668 +       if (test_tsk_need_resched_lazy(curr))
22669 +               return;
22671 +       set_tsk_need_resched_lazy(curr);
22673 +       cpu = cpu_of(rq);
22674 +       if (cpu == smp_processor_id())
22675 +               return;
22677 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
22678 +       smp_mb();
22679 +       if (!tsk_is_polling(curr))
22680 +               smp_send_reschedule(cpu);
22682 +#endif
22684  void resched_cpu(int cpu)
22686         struct rq *rq = cpu_rq(cpu);
22687 @@ -618,11 +658,14 @@ void resched_cpu(int cpu)
22688   */
22689  int get_nohz_timer_target(void)
22691 -       int i, cpu = smp_processor_id();
22692 +       int i, cpu;
22693         struct sched_domain *sd;
22695 +       preempt_disable_rt();
22696 +       cpu = smp_processor_id();
22698         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
22699 -               return cpu;
22700 +               goto preempt_en_rt;
22702         rcu_read_lock();
22703         for_each_domain(cpu, sd) {
22704 @@ -641,6 +684,8 @@ int get_nohz_timer_target(void)
22705                 cpu = housekeeping_any_cpu();
22706  unlock:
22707         rcu_read_unlock();
22708 +preempt_en_rt:
22709 +       preempt_enable_rt();
22710         return cpu;
22712  /*
22713 @@ -1174,6 +1219,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22715         lockdep_assert_held(&p->pi_lock);
22717 +       if (__migrate_disabled(p)) {
22718 +               cpumask_copy(&p->cpus_allowed, new_mask);
22719 +               return;
22720 +       }
22722         queued = task_on_rq_queued(p);
22723         running = task_current(rq, p);
22725 @@ -1196,6 +1246,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
22726                 enqueue_task(rq, p, ENQUEUE_RESTORE);
22729 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
22730 +static DEFINE_MUTEX(sched_down_mutex);
22731 +static cpumask_t sched_down_cpumask;
22733 +void tell_sched_cpu_down_begin(int cpu)
22735 +       mutex_lock(&sched_down_mutex);
22736 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
22737 +       mutex_unlock(&sched_down_mutex);
22740 +void tell_sched_cpu_down_done(int cpu)
22742 +       mutex_lock(&sched_down_mutex);
22743 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
22744 +       mutex_unlock(&sched_down_mutex);
22747 +/**
22748 + * migrate_me - try to move the current task off this cpu
22749 + *
22750 + * Used by the pin_current_cpu() code to try to get tasks
22751 + * to move off the current CPU as it is going down.
22752 + * It will only move the task if the task isn't pinned to
22753 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
22754 + * and the task is in a RUNNING state. Otherwise the
22755 + * movement of the task will wake it up (change its state
22756 + * to running) when the task did not expect it.
22757 + *
22758 + * Returns 1 if it succeeded in moving the current task
22759 + *         0 otherwise.
22760 + */
22761 +int migrate_me(void)
22763 +       struct task_struct *p = current;
22764 +       struct migration_arg arg;
22765 +       struct cpumask *cpumask;
22766 +       struct cpumask *mask;
22767 +       unsigned long flags;
22768 +       unsigned int dest_cpu;
22769 +       struct rq *rq;
22771 +       /*
22772 +        * We cannot migrate tasks bound to a CPU or tasks that are
22773 +        * not running. The movement of the task will wake it up.
22774 +        */
22775 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
22776 +               return 0;
22778 +       mutex_lock(&sched_down_mutex);
22779 +       rq = task_rq_lock(p, &flags);
22781 +       cpumask = this_cpu_ptr(&sched_cpumasks);
22782 +       mask = &p->cpus_allowed;
22784 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
22786 +       if (!cpumask_weight(cpumask)) {
22787 +               /* It's only on this CPU? */
22788 +               task_rq_unlock(rq, p, &flags);
22789 +               mutex_unlock(&sched_down_mutex);
22790 +               return 0;
22791 +       }
22793 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
22795 +       arg.task = p;
22796 +       arg.dest_cpu = dest_cpu;
22798 +       task_rq_unlock(rq, p, &flags);
22800 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
22801 +       tlb_migrate_finish(p->mm);
22802 +       mutex_unlock(&sched_down_mutex);
22804 +       return 1;
22807  /*
22808   * Change a given task's CPU affinity. Migrate the thread to a
22809   * proper CPU and schedule it away if the CPU it's executing on
22810 @@ -1235,7 +1363,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
22811         do_set_cpus_allowed(p, new_mask);
22813         /* Can the task run on the task's current CPU? If so, we're done */
22814 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
22815 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
22816                 goto out;
22818         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
22819 @@ -1411,6 +1539,18 @@ out:
22820         return ret;
22823 +static bool check_task_state(struct task_struct *p, long match_state)
22825 +       bool match = false;
22827 +       raw_spin_lock_irq(&p->pi_lock);
22828 +       if (p->state == match_state || p->saved_state == match_state)
22829 +               match = true;
22830 +       raw_spin_unlock_irq(&p->pi_lock);
22832 +       return match;
22835  /*
22836   * wait_task_inactive - wait for a thread to unschedule.
22837   *
22838 @@ -1455,7 +1595,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22839                  * is actually now running somewhere else!
22840                  */
22841                 while (task_running(rq, p)) {
22842 -                       if (match_state && unlikely(p->state != match_state))
22843 +                       if (match_state && !check_task_state(p, match_state))
22844                                 return 0;
22845                         cpu_relax();
22846                 }
22847 @@ -1470,7 +1610,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
22848                 running = task_running(rq, p);
22849                 queued = task_on_rq_queued(p);
22850                 ncsw = 0;
22851 -               if (!match_state || p->state == match_state)
22852 +               if (!match_state || p->state == match_state ||
22853 +                   p->saved_state == match_state)
22854                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
22855                 task_rq_unlock(rq, p, &flags);
22857 @@ -1627,7 +1768,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22859         lockdep_assert_held(&p->pi_lock);
22861 -       if (p->nr_cpus_allowed > 1)
22862 +       if (tsk_nr_cpus_allowed(p) > 1)
22863                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22865         /*
22866 @@ -1707,10 +1848,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
22868         activate_task(rq, p, en_flags);
22869         p->on_rq = TASK_ON_RQ_QUEUED;
22871 -       /* if a worker is waking up, notify workqueue */
22872 -       if (p->flags & PF_WQ_WORKER)
22873 -               wq_worker_waking_up(p, cpu_of(rq));
22876  /*
22877 @@ -1937,8 +2074,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
22878          */
22879         smp_mb__before_spinlock();
22880         raw_spin_lock_irqsave(&p->pi_lock, flags);
22881 -       if (!(p->state & state))
22882 +       if (!(p->state & state)) {
22883 +               /*
22884 +                * The task might be running due to a spinlock sleeper
22885 +                * wakeup. Check the saved state and set it to running
22886 +                * if the wakeup condition is true.
22887 +                */
22888 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
22889 +                       if (p->saved_state & state) {
22890 +                               p->saved_state = TASK_RUNNING;
22891 +                               success = 1;
22892 +                       }
22893 +               }
22894                 goto out;
22895 +       }
22897 +       /*
22898 +        * If this is a regular wakeup, then we can unconditionally
22899 +        * clear the saved state of a "lock sleeper".
22900 +        */
22901 +       if (!(wake_flags & WF_LOCK_SLEEPER))
22902 +               p->saved_state = TASK_RUNNING;
22904         trace_sched_waking(p);
22906 @@ -2030,52 +2186,6 @@ out:
22909  /**
22910 - * try_to_wake_up_local - try to wake up a local task with rq lock held
22911 - * @p: the thread to be awakened
22912 - *
22913 - * Put @p on the run-queue if it's not already there. The caller must
22914 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
22915 - * the current task.
22916 - */
22917 -static void try_to_wake_up_local(struct task_struct *p)
22919 -       struct rq *rq = task_rq(p);
22921 -       if (WARN_ON_ONCE(rq != this_rq()) ||
22922 -           WARN_ON_ONCE(p == current))
22923 -               return;
22925 -       lockdep_assert_held(&rq->lock);
22927 -       if (!raw_spin_trylock(&p->pi_lock)) {
22928 -               /*
22929 -                * This is OK, because current is on_cpu, which avoids it being
22930 -                * picked for load-balance and preemption/IRQs are still
22931 -                * disabled avoiding further scheduler activity on it and we've
22932 -                * not yet picked a replacement task.
22933 -                */
22934 -               lockdep_unpin_lock(&rq->lock);
22935 -               raw_spin_unlock(&rq->lock);
22936 -               raw_spin_lock(&p->pi_lock);
22937 -               raw_spin_lock(&rq->lock);
22938 -               lockdep_pin_lock(&rq->lock);
22939 -       }
22941 -       if (!(p->state & TASK_NORMAL))
22942 -               goto out;
22944 -       trace_sched_waking(p);
22946 -       if (!task_on_rq_queued(p))
22947 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
22949 -       ttwu_do_wakeup(rq, p, 0);
22950 -       ttwu_stat(p, smp_processor_id(), 0);
22951 -out:
22952 -       raw_spin_unlock(&p->pi_lock);
22955 -/**
22956   * wake_up_process - Wake up a specific process
22957   * @p: The process to be woken up.
22958   *
22959 @@ -2093,6 +2203,18 @@ int wake_up_process(struct task_struct *p)
22961  EXPORT_SYMBOL(wake_up_process);
22963 +/**
22964 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
22965 + * @p: The process to be woken up.
22966 + *
22967 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
22968 + * the nature of the wakeup.
22969 + */
22970 +int wake_up_lock_sleeper(struct task_struct *p)
22972 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
22975  int wake_up_state(struct task_struct *p, unsigned int state)
22977         return try_to_wake_up(p, state, 0);
22978 @@ -2279,6 +2401,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
22979         p->on_cpu = 0;
22980  #endif
22981         init_task_preempt_count(p);
22982 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
22983 +       task_thread_info(p)->preempt_lazy_count = 0;
22984 +#endif
22985  #ifdef CONFIG_SMP
22986         plist_node_init(&p->pushable_tasks, MAX_PRIO);
22987         RB_CLEAR_NODE(&p->pushable_dl_tasks);
22988 @@ -2603,8 +2728,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
22989         finish_arch_post_lock_switch();
22991         fire_sched_in_preempt_notifiers(current);
22992 +       /*
22993 +        * We use mmdrop_delayed() here so we don't have to do the
22994 +        * full __mmdrop() when we are the last user.
22995 +        */
22996         if (mm)
22997 -               mmdrop(mm);
22998 +               mmdrop_delayed(mm);
22999         if (unlikely(prev_state == TASK_DEAD)) {
23000                 if (prev->sched_class->task_dead)
23001                         prev->sched_class->task_dead(prev);
23002 @@ -2935,16 +3064,6 @@ u64 scheduler_tick_max_deferment(void)
23004  #endif
23006 -notrace unsigned long get_parent_ip(unsigned long addr)
23008 -       if (in_lock_functions(addr)) {
23009 -               addr = CALLER_ADDR2;
23010 -               if (in_lock_functions(addr))
23011 -                       addr = CALLER_ADDR3;
23012 -       }
23013 -       return addr;
23016  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
23017                                 defined(CONFIG_PREEMPT_TRACER))
23019 @@ -2966,7 +3085,7 @@ void preempt_count_add(int val)
23020                                 PREEMPT_MASK - 10);
23021  #endif
23022         if (preempt_count() == val) {
23023 -               unsigned long ip = get_parent_ip(CALLER_ADDR1);
23024 +               unsigned long ip = get_lock_parent_ip();
23025  #ifdef CONFIG_DEBUG_PREEMPT
23026                 current->preempt_disable_ip = ip;
23027  #endif
23028 @@ -2993,7 +3112,7 @@ void preempt_count_sub(int val)
23029  #endif
23031         if (preempt_count() == val)
23032 -               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
23033 +               trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
23034         __preempt_count_sub(val);
23036  EXPORT_SYMBOL(preempt_count_sub);
23037 @@ -3048,6 +3167,77 @@ static inline void schedule_debug(struct task_struct *prev)
23038         schedstat_inc(this_rq(), sched_count);
23041 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
23043 +void migrate_disable(void)
23045 +       struct task_struct *p = current;
23047 +       if (in_atomic() || irqs_disabled()) {
23048 +#ifdef CONFIG_SCHED_DEBUG
23049 +               p->migrate_disable_atomic++;
23050 +#endif
23051 +               return;
23052 +       }
23054 +#ifdef CONFIG_SCHED_DEBUG
23055 +       if (unlikely(p->migrate_disable_atomic)) {
23056 +               tracing_off();
23057 +               WARN_ON_ONCE(1);
23058 +       }
23059 +#endif
23061 +       if (p->migrate_disable) {
23062 +               p->migrate_disable++;
23063 +               return;
23064 +       }
23066 +       preempt_disable();
23067 +       preempt_lazy_disable();
23068 +       pin_current_cpu();
23069 +       p->migrate_disable = 1;
23070 +       preempt_enable();
23072 +EXPORT_SYMBOL(migrate_disable);
23074 +void migrate_enable(void)
23076 +       struct task_struct *p = current;
23078 +       if (in_atomic() || irqs_disabled()) {
23079 +#ifdef CONFIG_SCHED_DEBUG
23080 +               p->migrate_disable_atomic--;
23081 +#endif
23082 +               return;
23083 +       }
23085 +#ifdef CONFIG_SCHED_DEBUG
23086 +       if (unlikely(p->migrate_disable_atomic)) {
23087 +               tracing_off();
23088 +               WARN_ON_ONCE(1);
23089 +       }
23090 +#endif
23091 +       WARN_ON_ONCE(p->migrate_disable <= 0);
23093 +       if (p->migrate_disable > 1) {
23094 +               p->migrate_disable--;
23095 +               return;
23096 +       }
23098 +       preempt_disable();
23099 +       /*
23100 +        * Clearing migrate_disable causes tsk_cpus_allowed to
23101 +        * show the task's original cpu affinity.
23102 +        */
23103 +       p->migrate_disable = 0;
23105 +       unpin_current_cpu();
23106 +       preempt_enable();
23107 +       preempt_lazy_enable();
23109 +EXPORT_SYMBOL(migrate_enable);
23110 +#endif
23112  /*
23113   * Pick up the highest-prio task:
23114   */
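
The migrate_disable()/migrate_enable() pair added in the hunk above is recursive: only the outermost disable pins the task to its current CPU and only the matching outermost enable unpins it, with the per-task counter tracking the nesting depth; the in_atomic()/irqs_disabled() early returns and the preempt/lazy-preempt bracketing are RT details layered on top. The fragment below shows just the nesting rule with a thread-local counter; pin_current_cpu()/unpin_current_cpu() are empty stand-ins and everything here is invented for the illustration.

/*
 * Nesting rule only: pin on the first disable, unpin on the last enable.
 */
#include <assert.h>
#include <stdio.h>

static __thread int migrate_disable_count;

static void pin_current_cpu(void)   { printf("pinned\n"); }
static void unpin_current_cpu(void) { printf("unpinned\n"); }

static void migrate_disable(void)
{
	if (migrate_disable_count++ == 0)
		pin_current_cpu();
}

static void migrate_enable(void)
{
	assert(migrate_disable_count > 0);
	if (--migrate_disable_count == 0)
		unpin_current_cpu();
}

int main(void)
{
	migrate_disable();
	migrate_disable();	/* nested: no second pin */
	migrate_enable();
	migrate_enable();	/* outermost: unpin happens here */
	return 0;
}
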
23115 @@ -3172,19 +3362,6 @@ static void __sched notrace __schedule(bool preempt)
23116                 } else {
23117                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
23118                         prev->on_rq = 0;
23120 -                       /*
23121 -                        * If a worker went to sleep, notify and ask workqueue
23122 -                        * whether it wants to wake up a task to maintain
23123 -                        * concurrency.
23124 -                        */
23125 -                       if (prev->flags & PF_WQ_WORKER) {
23126 -                               struct task_struct *to_wakeup;
23128 -                               to_wakeup = wq_worker_sleeping(prev, cpu);
23129 -                               if (to_wakeup)
23130 -                                       try_to_wake_up_local(to_wakeup);
23131 -                       }
23132                 }
23133                 switch_count = &prev->nvcsw;
23134         }
23135 @@ -3194,6 +3371,7 @@ static void __sched notrace __schedule(bool preempt)
23137         next = pick_next_task(rq, prev);
23138         clear_tsk_need_resched(prev);
23139 +       clear_tsk_need_resched_lazy(prev);
23140         clear_preempt_need_resched();
23141         rq->clock_skip_update = 0;
23143 @@ -3215,9 +3393,20 @@ static void __sched notrace __schedule(bool preempt)
23145  static inline void sched_submit_work(struct task_struct *tsk)
23147 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
23148 +       if (!tsk->state)
23149                 return;
23150         /*
23151 +        * If a worker went to sleep, notify and ask workqueue whether
23152 +        * it wants to wake up a task to maintain concurrency.
23153 +        */
23154 +       if (tsk->flags & PF_WQ_WORKER)
23155 +               wq_worker_sleeping(tsk);
23158 +       if (tsk_is_pi_blocked(tsk))
23159 +               return;
23161 +       /*
23162          * If we are going to sleep and we have plugged IO queued,
23163          * make sure to submit it to avoid deadlocks.
23164          */
23165 @@ -3225,6 +3414,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
23166                 blk_schedule_flush_plug(tsk);
23169 +static void sched_update_worker(struct task_struct *tsk)
23171 +       if (tsk->flags & PF_WQ_WORKER)
23172 +               wq_worker_running(tsk);
23175  asmlinkage __visible void __sched schedule(void)
23177         struct task_struct *tsk = current;
23178 @@ -3235,6 +3430,7 @@ asmlinkage __visible void __sched schedule(void)
23179                 __schedule(false);
23180                 sched_preempt_enable_no_resched();
23181         } while (need_resched());
23182 +       sched_update_worker(tsk);
23184  EXPORT_SYMBOL(schedule);
23186 @@ -3283,6 +3479,30 @@ static void __sched notrace preempt_schedule_common(void)
23187         } while (need_resched());
23190 +#ifdef CONFIG_PREEMPT_LAZY
23192 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
23193 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
23194 + * the preempt_lazy_count counter is > 0.
23195 + */
23196 +static __always_inline int preemptible_lazy(void)
23198 +       if (test_thread_flag(TIF_NEED_RESCHED))
23199 +               return 1;
23200 +       if (current_thread_info()->preempt_lazy_count)
23201 +               return 0;
23202 +       return 1;
23205 +#else
23207 +static inline int preemptible_lazy(void)
23209 +       return 1;
23212 +#endif
23214  #ifdef CONFIG_PREEMPT
23215  /*
23216   * this is the entry point to schedule() from in-kernel preemption
23217 @@ -3297,6 +3517,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
23218          */
23219         if (likely(!preemptible()))
23220                 return;
23221 +       if (!preemptible_lazy())
23222 +               return;
23224         preempt_schedule_common();
23226 @@ -3323,6 +3545,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
23228         if (likely(!preemptible()))
23229                 return;
23230 +       if (!preemptible_lazy())
23231 +               return;
23233         do {
23234                 preempt_disable_notrace();
23235 @@ -3332,7 +3556,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
23236                  * an infinite recursion.
23237                  */
23238                 prev_ctx = exception_enter();
23239 +               /*
23240 +                * The add/subtract must not be traced by the function
23241 +                * tracer. But we still want to account for the
23242 +                * preempt off latency tracer. Since the _notrace versions
23243 +                * of add/subtract skip the accounting for latency tracer
23244 +                * we must force it manually.
23245 +                */
23246 +               start_critical_timings();
23247                 __schedule(true);
23248 +               stop_critical_timings();
23249                 exception_exit(prev_ctx);
23251                 preempt_enable_no_resched_notrace();
23252 @@ -4676,6 +4909,7 @@ int __cond_resched_lock(spinlock_t *lock)
23254  EXPORT_SYMBOL(__cond_resched_lock);
23256 +#ifndef CONFIG_PREEMPT_RT_FULL
23257  int __sched __cond_resched_softirq(void)
23259         BUG_ON(!in_softirq());
23260 @@ -4689,6 +4923,7 @@ int __sched __cond_resched_softirq(void)
23261         return 0;
23263  EXPORT_SYMBOL(__cond_resched_softirq);
23264 +#endif
23266  /**
23267   * yield - yield the current processor to other threads.
23268 @@ -5055,7 +5290,9 @@ void init_idle(struct task_struct *idle, int cpu)
23270         /* Set the preempt count _outside_ the spinlocks! */
23271         init_idle_preempt_count(idle, cpu);
23273 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
23274 +       task_thread_info(idle)->preempt_lazy_count = 0;
23275 +#endif
23276         /*
23277          * The idle tasks have their own, simple scheduling class:
23278          */
23279 @@ -5196,6 +5433,8 @@ void sched_setnuma(struct task_struct *p, int nid)
23280  #endif /* CONFIG_NUMA_BALANCING */
23282  #ifdef CONFIG_HOTPLUG_CPU
23283 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
23285  /*
23286   * Ensures that the idle task is using init_mm right before its cpu goes
23287   * offline.
23288 @@ -5210,7 +5449,11 @@ void idle_task_exit(void)
23289                 switch_mm(mm, &init_mm, current);
23290                 finish_arch_post_lock_switch();
23291         }
23292 -       mmdrop(mm);
23293 +       /*
23294 +        * Defer the cleanup to an alive cpu. On RT we can neither
23295 +        * call mmdrop() nor mmdrop_delayed() from here.
23296 +        */
23297 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
23300  /*
23301 @@ -5583,6 +5826,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
23303         case CPU_DEAD:
23304                 calc_load_migrate(rq);
23305 +               if (per_cpu(idle_last_mm, cpu)) {
23306 +                       mmdrop(per_cpu(idle_last_mm, cpu));
23307 +                       per_cpu(idle_last_mm, cpu) = NULL;
23308 +               }
23309                 break;
23310  #endif
23311         }
23312 @@ -7566,7 +7813,7 @@ void __init sched_init(void)
23313  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
23314  static inline int preempt_count_equals(int preempt_offset)
23316 -       int nested = preempt_count() + rcu_preempt_depth();
23317 +       int nested = preempt_count() + sched_rcu_preempt_depth();
23319         return (nested == preempt_offset);
23321 diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
23322 index 5a75b08cfd85..5be58820465c 100644
23323 --- a/kernel/sched/cpudeadline.c
23324 +++ b/kernel/sched/cpudeadline.c
23325 @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
23326         const struct sched_dl_entity *dl_se = &p->dl;
23328         if (later_mask &&
23329 -           cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
23330 +           cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
23331                 best_cpu = cpumask_any(later_mask);
23332                 goto out;
23333 -       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
23334 +       } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
23335                         dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
23336                 best_cpu = cpudl_maximum(cp);
23337                 if (later_mask)
23338 diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
23339 index 981fcd7dc394..11e9705bf937 100644
23340 --- a/kernel/sched/cpupri.c
23341 +++ b/kernel/sched/cpupri.c
23342 @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
23343                 if (skip)
23344                         continue;
23346 -               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
23347 +               if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
23348                         continue;
23350                 if (lowest_mask) {
23351 -                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
23352 +                       cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
23354                         /*
23355                          * We have to ensure that we have at least one bit
23356 diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
23357 index a1aecbedf5b1..558b98af241d 100644
23358 --- a/kernel/sched/cputime.c
23359 +++ b/kernel/sched/cputime.c
23360 @@ -685,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
23362         unsigned long long delta = vtime_delta(tsk);
23364 -       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
23365 +       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
23366         tsk->vtime_snap += delta;
23368         /* CHECKME: always safe to convert nsecs to cputime? */
23369 @@ -701,37 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
23371  void vtime_account_system(struct task_struct *tsk)
23373 -       write_seqlock(&tsk->vtime_seqlock);
23374 +       write_seqcount_begin(&tsk->vtime_seqcount);
23375         __vtime_account_system(tsk);
23376 -       write_sequnlock(&tsk->vtime_seqlock);
23377 +       write_seqcount_end(&tsk->vtime_seqcount);
23380  void vtime_gen_account_irq_exit(struct task_struct *tsk)
23382 -       write_seqlock(&tsk->vtime_seqlock);
23383 +       write_seqcount_begin(&tsk->vtime_seqcount);
23384         __vtime_account_system(tsk);
23385         if (context_tracking_in_user())
23386                 tsk->vtime_snap_whence = VTIME_USER;
23387 -       write_sequnlock(&tsk->vtime_seqlock);
23388 +       write_seqcount_end(&tsk->vtime_seqcount);
23391  void vtime_account_user(struct task_struct *tsk)
23393         cputime_t delta_cpu;
23395 -       write_seqlock(&tsk->vtime_seqlock);
23396 +       write_seqcount_begin(&tsk->vtime_seqcount);
23397         delta_cpu = get_vtime_delta(tsk);
23398         tsk->vtime_snap_whence = VTIME_SYS;
23399         account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
23400 -       write_sequnlock(&tsk->vtime_seqlock);
23401 +       write_seqcount_end(&tsk->vtime_seqcount);
23404  void vtime_user_enter(struct task_struct *tsk)
23406 -       write_seqlock(&tsk->vtime_seqlock);
23407 +       write_seqcount_begin(&tsk->vtime_seqcount);
23408         __vtime_account_system(tsk);
23409         tsk->vtime_snap_whence = VTIME_USER;
23410 -       write_sequnlock(&tsk->vtime_seqlock);
23411 +       write_seqcount_end(&tsk->vtime_seqcount);
23414  void vtime_guest_enter(struct task_struct *tsk)
23415 @@ -743,19 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
23416          * synchronization against the reader (task_gtime())
23417          * that can thus safely catch up with a tickless delta.
23418          */
23419 -       write_seqlock(&tsk->vtime_seqlock);
23420 +       write_seqcount_begin(&tsk->vtime_seqcount);
23421         __vtime_account_system(tsk);
23422         current->flags |= PF_VCPU;
23423 -       write_sequnlock(&tsk->vtime_seqlock);
23424 +       write_seqcount_end(&tsk->vtime_seqcount);
23426  EXPORT_SYMBOL_GPL(vtime_guest_enter);
23428  void vtime_guest_exit(struct task_struct *tsk)
23430 -       write_seqlock(&tsk->vtime_seqlock);
23431 +       write_seqcount_begin(&tsk->vtime_seqcount);
23432         __vtime_account_system(tsk);
23433         current->flags &= ~PF_VCPU;
23434 -       write_sequnlock(&tsk->vtime_seqlock);
23435 +       write_seqcount_end(&tsk->vtime_seqcount);
23437  EXPORT_SYMBOL_GPL(vtime_guest_exit);
23439 @@ -768,24 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
23441  void arch_vtime_task_switch(struct task_struct *prev)
23443 -       write_seqlock(&prev->vtime_seqlock);
23444 -       prev->vtime_snap_whence = VTIME_SLEEPING;
23445 -       write_sequnlock(&prev->vtime_seqlock);
23446 +       write_seqcount_begin(&prev->vtime_seqcount);
23447 +       prev->vtime_snap_whence = VTIME_INACTIVE;
23448 +       write_seqcount_end(&prev->vtime_seqcount);
23450 -       write_seqlock(&current->vtime_seqlock);
23451 +       write_seqcount_begin(&current->vtime_seqcount);
23452         current->vtime_snap_whence = VTIME_SYS;
23453         current->vtime_snap = sched_clock_cpu(smp_processor_id());
23454 -       write_sequnlock(&current->vtime_seqlock);
23455 +       write_seqcount_end(&current->vtime_seqcount);
23458  void vtime_init_idle(struct task_struct *t, int cpu)
23460         unsigned long flags;
23462 -       write_seqlock_irqsave(&t->vtime_seqlock, flags);
23463 +       local_irq_save(flags);
23464 +       write_seqcount_begin(&t->vtime_seqcount);
23465         t->vtime_snap_whence = VTIME_SYS;
23466         t->vtime_snap = sched_clock_cpu(cpu);
23467 -       write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
23468 +       write_seqcount_end(&t->vtime_seqcount);
23469 +       local_irq_restore(flags);
23472  cputime_t task_gtime(struct task_struct *t)
23473 @@ -797,13 +799,13 @@ cputime_t task_gtime(struct task_struct *t)
23474                 return t->gtime;
23476         do {
23477 -               seq = read_seqbegin(&t->vtime_seqlock);
23478 +               seq = read_seqcount_begin(&t->vtime_seqcount);
23480                 gtime = t->gtime;
23481                 if (t->flags & PF_VCPU)
23482                         gtime += vtime_delta(t);
23484 -       } while (read_seqretry(&t->vtime_seqlock, seq));
23485 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
23487         return gtime;
23489 @@ -826,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
23490                 *udelta = 0;
23491                 *sdelta = 0;
23493 -               seq = read_seqbegin(&t->vtime_seqlock);
23494 +               seq = read_seqcount_begin(&t->vtime_seqcount);
23496                 if (u_dst)
23497                         *u_dst = *u_src;
23498 @@ -834,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
23499                         *s_dst = *s_src;
23501                 /* Task is sleeping, nothing to add */
23502 -               if (t->vtime_snap_whence == VTIME_SLEEPING ||
23503 +               if (t->vtime_snap_whence == VTIME_INACTIVE ||
23504                     is_idle_task(t))
23505                         continue;
23507 @@ -850,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
23508                         if (t->vtime_snap_whence == VTIME_SYS)
23509                                 *sdelta = delta;
23510                 }
23511 -       } while (read_seqretry(&t->vtime_seqlock, seq));
23512 +       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
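Not part of the patch: the cputime.c hunks above replace the vtime seqlock with a raw seqcount, so readers such as task_gtime() must loop until they observe an unchanged, even sequence number. The stand-alone user-space sketch below (hypothetical demo_* names, deliberately single-threaded, without the memory barriers the real read_seqcount_begin()/read_seqcount_retry() provide) models only that retry discipline.

#include <stdio.h>

struct demo_seqcount { unsigned seq; };

static struct demo_seqcount vtime_seq;
static unsigned long long vtime_snap;

static void demo_write_begin(struct demo_seqcount *s) { s->seq++; }  /* now odd */
static void demo_write_end(struct demo_seqcount *s)   { s->seq++; }  /* even again */

static unsigned demo_read_begin(const struct demo_seqcount *s)
{
        return s->seq & ~1u;            /* even snapshot of the sequence */
}

static int demo_read_retry(const struct demo_seqcount *s, unsigned start)
{
        return s->seq != start;         /* odd or changed: retry the read */
}

int main(void)
{
        unsigned seq;
        unsigned long long snap;

        demo_write_begin(&vtime_seq);   /* writer updates under "write side" */
        vtime_snap = 42;
        demo_write_end(&vtime_seq);

        do {                            /* reader retries until consistent */
                seq = demo_read_begin(&vtime_seq);
                snap = vtime_snap;
        } while (demo_read_retry(&vtime_seq, seq));

        printf("consistent snapshot: %llu\n", snap);
        return 0;
}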
23516 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
23517 index e984f059e5fc..20e0c9b9ace5 100644
23518 --- a/kernel/sched/deadline.c
23519 +++ b/kernel/sched/deadline.c
23520 @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
23522         struct task_struct *p = dl_task_of(dl_se);
23524 -       if (p->nr_cpus_allowed > 1)
23525 +       if (tsk_nr_cpus_allowed(p) > 1)
23526                 dl_rq->dl_nr_migratory++;
23528         update_dl_migration(dl_rq);
23529 @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
23531         struct task_struct *p = dl_task_of(dl_se);
23533 -       if (p->nr_cpus_allowed > 1)
23534 +       if (tsk_nr_cpus_allowed(p) > 1)
23535                 dl_rq->dl_nr_migratory--;
23537         update_dl_migration(dl_rq);
23538 @@ -697,6 +697,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
23540         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23541         timer->function = dl_task_timer;
23542 +       timer->irqsafe = 1;
23545  static
23546 @@ -989,7 +990,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
23548         enqueue_dl_entity(&p->dl, pi_se, flags);
23550 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23551 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23552                 enqueue_pushable_dl_task(rq, p);
23555 @@ -1067,9 +1068,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
23556          * try to make it stay here, it might be important.
23557          */
23558         if (unlikely(dl_task(curr)) &&
23559 -           (curr->nr_cpus_allowed < 2 ||
23560 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23561              !dl_entity_preempt(&p->dl, &curr->dl)) &&
23562 -           (p->nr_cpus_allowed > 1)) {
23563 +           (tsk_nr_cpus_allowed(p) > 1)) {
23564                 int target = find_later_rq(p);
23566                 if (target != -1 &&
23567 @@ -1090,7 +1091,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
23568          * Current can't be migrated, useless to reschedule,
23569          * let's hope p can move out.
23570          */
23571 -       if (rq->curr->nr_cpus_allowed == 1 ||
23572 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23573             cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
23574                 return;
23576 @@ -1098,7 +1099,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
23577          * p is migratable, so let's not schedule it and
23578          * see if it is pushed or pulled somewhere else.
23579          */
23580 -       if (p->nr_cpus_allowed != 1 &&
23581 +       if (tsk_nr_cpus_allowed(p) != 1 &&
23582             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
23583                 return;
23585 @@ -1212,7 +1213,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
23587         update_curr_dl(rq);
23589 -       if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
23590 +       if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
23591                 enqueue_pushable_dl_task(rq, p);
23594 @@ -1335,7 +1336,7 @@ static int find_later_rq(struct task_struct *task)
23595         if (unlikely(!later_mask))
23596                 return -1;
23598 -       if (task->nr_cpus_allowed == 1)
23599 +       if (tsk_nr_cpus_allowed(task) == 1)
23600                 return -1;
23602         /*
23603 @@ -1441,7 +1442,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
23604                 if (double_lock_balance(rq, later_rq)) {
23605                         if (unlikely(task_rq(task) != rq ||
23606                                      !cpumask_test_cpu(later_rq->cpu,
23607 -                                                      &task->cpus_allowed) ||
23608 +                                                      tsk_cpus_allowed(task)) ||
23609                                      task_running(rq, task) ||
23610                                      !task_on_rq_queued(task))) {
23611                                 double_unlock_balance(rq, later_rq);
23612 @@ -1480,7 +1481,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
23614         BUG_ON(rq->cpu != task_cpu(p));
23615         BUG_ON(task_current(rq, p));
23616 -       BUG_ON(p->nr_cpus_allowed <= 1);
23617 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23619         BUG_ON(!task_on_rq_queued(p));
23620         BUG_ON(!dl_task(p));
23621 @@ -1519,7 +1520,7 @@ retry:
23622          */
23623         if (dl_task(rq->curr) &&
23624             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
23625 -           rq->curr->nr_cpus_allowed > 1) {
23626 +           tsk_nr_cpus_allowed(rq->curr) > 1) {
23627                 resched_curr(rq);
23628                 return 0;
23629         }
23630 @@ -1666,9 +1667,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
23632         if (!task_running(rq, p) &&
23633             !test_tsk_need_resched(rq->curr) &&
23634 -           p->nr_cpus_allowed > 1 &&
23635 +           tsk_nr_cpus_allowed(p) > 1 &&
23636             dl_task(rq->curr) &&
23637 -           (rq->curr->nr_cpus_allowed < 2 ||
23638 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
23639              !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
23640                 push_dl_tasks(rq);
23641         }
23642 @@ -1769,7 +1770,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
23644         if (task_on_rq_queued(p) && rq->curr != p) {
23645  #ifdef CONFIG_SMP
23646 -               if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
23647 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
23648                         queue_push_tasks(rq);
23649  #endif
23650                 if (dl_task(rq->curr))
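Not part of the patch: the deadline.c hunks above route every affinity check through tsk_cpus_allowed()/tsk_nr_cpus_allowed() instead of reading p->cpus_allowed and p->nr_cpus_allowed directly. The helpers themselves are defined elsewhere in this patch and are not shown in this excerpt; the fragment below only illustrates the assumed idea that a migrate-disabled task reports an effective affinity of exactly one CPU, which is what lets the push/pull logic skip it (hypothetical demo_* names, plain user-space C).

#include <stdio.h>

struct demo_task {
        int nr_cpus_allowed;
        int migrate_disable;
};

/* A migrate-disabled task is treated as pinned to a single CPU. */
static int demo_tsk_nr_cpus_allowed(const struct demo_task *p)
{
        return p->migrate_disable ? 1 : p->nr_cpus_allowed;
}

int main(void)
{
        struct demo_task p = { .nr_cpus_allowed = 4, .migrate_disable = 1 };

        /* The push/pull paths above would now treat p as non-migratable. */
        printf("effective affinity: %d cpu(s)\n", demo_tsk_nr_cpus_allowed(&p));
        return 0;
}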
23651 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
23652 index 641511771ae6..a2d69b883623 100644
23653 --- a/kernel/sched/debug.c
23654 +++ b/kernel/sched/debug.c
23655 @@ -251,6 +251,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
23656         P(rt_throttled);
23657         PN(rt_time);
23658         PN(rt_runtime);
23659 +#ifdef CONFIG_SMP
23660 +       P(rt_nr_migratory);
23661 +#endif
23663  #undef PN
23664  #undef P
23665 @@ -635,6 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
23666  #endif
23667         P(policy);
23668         P(prio);
23669 +#ifdef CONFIG_PREEMPT_RT_FULL
23670 +       P(migrate_disable);
23671 +#endif
23672 +       P(nr_cpus_allowed);
23673  #undef PN
23674  #undef __PN
23675  #undef P
23676 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
23677 index 812069b66f47..ddf1424bcc78 100644
23678 --- a/kernel/sched/fair.c
23679 +++ b/kernel/sched/fair.c
23680 @@ -3166,7 +3166,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23681         ideal_runtime = sched_slice(cfs_rq, curr);
23682         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
23683         if (delta_exec > ideal_runtime) {
23684 -               resched_curr(rq_of(cfs_rq));
23685 +               resched_curr_lazy(rq_of(cfs_rq));
23686                 /*
23687                  * The current task ran long enough, ensure it doesn't get
23688                  * re-elected due to buddy favours.
23689 @@ -3190,7 +3190,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
23690                 return;
23692         if (delta > ideal_runtime)
23693 -               resched_curr(rq_of(cfs_rq));
23694 +               resched_curr_lazy(rq_of(cfs_rq));
23697  static void
23698 @@ -3330,7 +3330,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
23699          * validating it and just reschedule.
23700          */
23701         if (queued) {
23702 -               resched_curr(rq_of(cfs_rq));
23703 +               resched_curr_lazy(rq_of(cfs_rq));
23704                 return;
23705         }
23706         /*
23707 @@ -3512,7 +3512,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
23708          * hierarchy can be throttled
23709          */
23710         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
23711 -               resched_curr(rq_of(cfs_rq));
23712 +               resched_curr_lazy(rq_of(cfs_rq));
23715  static __always_inline
23716 @@ -4144,7 +4144,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
23718                 if (delta < 0) {
23719                         if (rq->curr == p)
23720 -                               resched_curr(rq);
23721 +                               resched_curr_lazy(rq);
23722                         return;
23723                 }
23724                 hrtick_start(rq, delta);
23725 @@ -5232,7 +5232,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
23726         return;
23728  preempt:
23729 -       resched_curr(rq);
23730 +       resched_curr_lazy(rq);
23731         /*
23732          * Only set the backward buddy when the current task is still
23733          * on the rq. This can happen when a wakeup gets interleaved
23734 @@ -7983,7 +7983,7 @@ static void task_fork_fair(struct task_struct *p)
23735                  * 'current' within the tree based on its new key value.
23736                  */
23737                 swap(curr->vruntime, se->vruntime);
23738 -               resched_curr(rq);
23739 +               resched_curr_lazy(rq);
23740         }
23742         se->vruntime -= cfs_rq->min_vruntime;
23743 @@ -8008,7 +8008,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
23744          */
23745         if (rq->curr == p) {
23746                 if (p->prio > oldprio)
23747 -                       resched_curr(rq);
23748 +                       resched_curr_lazy(rq);
23749         } else
23750                 check_preempt_curr(rq, p, 0);
23752 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
23753 index 69631fa46c2f..6d28fcd08872 100644
23754 --- a/kernel/sched/features.h
23755 +++ b/kernel/sched/features.h
23756 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
23757   */
23758  SCHED_FEAT(NONTASK_CAPACITY, true)
23760 +#ifdef CONFIG_PREEMPT_RT_FULL
23761 +SCHED_FEAT(TTWU_QUEUE, false)
23762 +# ifdef CONFIG_PREEMPT_LAZY
23763 +SCHED_FEAT(PREEMPT_LAZY, true)
23764 +# endif
23765 +#else
23767  /*
23768   * Queue remote wakeups on the target CPU and process them
23769   * using the scheduler IPI. Reduces rq->lock contention/bounces.
23770   */
23771  SCHED_FEAT(TTWU_QUEUE, true)
23772 +#endif
23774  #ifdef HAVE_RT_PUSH_IPI
23775  /*
23776 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
23777 index 78ae5c1d9412..4ac6937f4a65 100644
23778 --- a/kernel/sched/rt.c
23779 +++ b/kernel/sched/rt.c
23780 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
23782         hrtimer_init(&rt_b->rt_period_timer,
23783                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23784 +       rt_b->rt_period_timer.irqsafe = 1;
23785         rt_b->rt_period_timer.function = sched_rt_period_timer;
23788 @@ -93,6 +94,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
23789         rt_rq->push_cpu = nr_cpu_ids;
23790         raw_spin_lock_init(&rt_rq->push_lock);
23791         init_irq_work(&rt_rq->push_work, push_irq_work_func);
23792 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
23793  #endif
23794  #endif /* CONFIG_SMP */
23795         /* We start in dequeued state, because no RT tasks are queued */
23796 @@ -326,7 +328,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23797         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23799         rt_rq->rt_nr_total++;
23800 -       if (p->nr_cpus_allowed > 1)
23801 +       if (tsk_nr_cpus_allowed(p) > 1)
23802                 rt_rq->rt_nr_migratory++;
23804         update_rt_migration(rt_rq);
23805 @@ -343,7 +345,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
23806         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
23808         rt_rq->rt_nr_total--;
23809 -       if (p->nr_cpus_allowed > 1)
23810 +       if (tsk_nr_cpus_allowed(p) > 1)
23811                 rt_rq->rt_nr_migratory--;
23813         update_rt_migration(rt_rq);
23814 @@ -1262,7 +1264,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
23816         enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
23818 -       if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
23819 +       if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
23820                 enqueue_pushable_task(rq, p);
23823 @@ -1351,7 +1353,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
23824          * will have to sort it out.
23825          */
23826         if (curr && unlikely(rt_task(curr)) &&
23827 -           (curr->nr_cpus_allowed < 2 ||
23828 +           (tsk_nr_cpus_allowed(curr) < 2 ||
23829              curr->prio <= p->prio)) {
23830                 int target = find_lowest_rq(p);
23832 @@ -1375,7 +1377,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23833          * Current can't be migrated, useless to reschedule,
23834          * let's hope p can move out.
23835          */
23836 -       if (rq->curr->nr_cpus_allowed == 1 ||
23837 +       if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
23838             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
23839                 return;
23841 @@ -1383,7 +1385,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
23842          * p is migratable, so let's not schedule it and
23843          * see if it is pushed or pulled somewhere else.
23844          */
23845 -       if (p->nr_cpus_allowed != 1
23846 +       if (tsk_nr_cpus_allowed(p) != 1
23847             && cpupri_find(&rq->rd->cpupri, p, NULL))
23848                 return;
23850 @@ -1517,7 +1519,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
23851          * The previous task needs to be made eligible for pushing
23852          * if it is still active
23853          */
23854 -       if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
23855 +       if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
23856                 enqueue_pushable_task(rq, p);
23859 @@ -1567,7 +1569,7 @@ static int find_lowest_rq(struct task_struct *task)
23860         if (unlikely(!lowest_mask))
23861                 return -1;
23863 -       if (task->nr_cpus_allowed == 1)
23864 +       if (tsk_nr_cpus_allowed(task) == 1)
23865                 return -1; /* No other targets possible */
23867         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
23868 @@ -1699,7 +1701,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
23870         BUG_ON(rq->cpu != task_cpu(p));
23871         BUG_ON(task_current(rq, p));
23872 -       BUG_ON(p->nr_cpus_allowed <= 1);
23873 +       BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
23875         BUG_ON(!task_on_rq_queued(p));
23876         BUG_ON(!rt_task(p));
23877 @@ -2059,9 +2061,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
23879         if (!task_running(rq, p) &&
23880             !test_tsk_need_resched(rq->curr) &&
23881 -           p->nr_cpus_allowed > 1 &&
23882 +           tsk_nr_cpus_allowed(p) > 1 &&
23883             (dl_task(rq->curr) || rt_task(rq->curr)) &&
23884 -           (rq->curr->nr_cpus_allowed < 2 ||
23885 +           (tsk_nr_cpus_allowed(rq->curr) < 2 ||
23886              rq->curr->prio <= p->prio))
23887                 push_rt_tasks(rq);
23889 @@ -2134,7 +2136,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
23890          */
23891         if (task_on_rq_queued(p) && rq->curr != p) {
23892  #ifdef CONFIG_SMP
23893 -               if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
23894 +               if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
23895                         queue_push_tasks(rq);
23896  #endif /* CONFIG_SMP */
23897                 if (p->prio < rq->curr->prio)
23898 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
23899 index 4e5db65d1aab..ae26df3bd63f 100644
23900 --- a/kernel/sched/sched.h
23901 +++ b/kernel/sched/sched.h
23902 @@ -1100,6 +1100,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
23903  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
23904  #define WF_FORK                0x02            /* child wakeup after fork */
23905  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
23906 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
23908  /*
23909   * To aid in avoiding the subversion of "niceness" due to uneven distribution
23910 @@ -1299,6 +1300,15 @@ extern void init_sched_fair_class(void);
23911  extern void resched_curr(struct rq *rq);
23912  extern void resched_cpu(int cpu);
23914 +#ifdef CONFIG_PREEMPT_LAZY
23915 +extern void resched_curr_lazy(struct rq *rq);
23916 +#else
23917 +static inline void resched_curr_lazy(struct rq *rq)
23919 +       resched_curr(rq);
23921 +#endif
23923  extern struct rt_bandwidth def_rt_bandwidth;
23924  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
23926 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
23927 new file mode 100644
23928 index 000000000000..205fe36868f9
23929 --- /dev/null
23930 +++ b/kernel/sched/swait.c
23931 @@ -0,0 +1,143 @@
23932 +#include <linux/sched.h>
23933 +#include <linux/swait.h>
23934 +#include <linux/suspend.h>
23936 +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
23937 +                            struct lock_class_key *key)
23939 +       raw_spin_lock_init(&q->lock);
23940 +       lockdep_set_class_and_name(&q->lock, key, name);
23941 +       INIT_LIST_HEAD(&q->task_list);
23943 +EXPORT_SYMBOL(__init_swait_queue_head);
23946 + * The thing about the wake_up_state() return value; I think we can ignore it.
23947 + *
23948 + * If for some reason it would return 0, that means the previously waiting
23949 + * task is already running, so it will observe condition true (or has already).
23950 + */
23951 +void swake_up_locked(struct swait_queue_head *q)
23953 +       struct swait_queue *curr;
23955 +       if (list_empty(&q->task_list))
23956 +               return;
23958 +       curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
23959 +       wake_up_process(curr->task);
23960 +       list_del_init(&curr->task_list);
23962 +EXPORT_SYMBOL(swake_up_locked);
23964 +void swake_up_all_locked(struct swait_queue_head *q)
23966 +       struct swait_queue *curr;
23967 +       int wakes = 0;
23969 +       while (!list_empty(&q->task_list)) {
23971 +               curr = list_first_entry(&q->task_list, typeof(*curr),
23972 +                                       task_list);
23973 +               wake_up_process(curr->task);
23974 +               list_del_init(&curr->task_list);
23975 +               wakes++;
23976 +       }
23977 +       if (pm_in_action)
23978 +               return;
23979 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
23981 +EXPORT_SYMBOL(swake_up_all_locked);
23983 +void swake_up(struct swait_queue_head *q)
23985 +       unsigned long flags;
23987 +       if (!swait_active(q))
23988 +               return;
23990 +       raw_spin_lock_irqsave(&q->lock, flags);
23991 +       swake_up_locked(q);
23992 +       raw_spin_unlock_irqrestore(&q->lock, flags);
23994 +EXPORT_SYMBOL(swake_up);
23997 + * Does not allow usage from IRQ disabled, since we must be able to
23998 + * release IRQs to guarantee bounded hold time.
23999 + */
24000 +void swake_up_all(struct swait_queue_head *q)
24002 +       struct swait_queue *curr;
24003 +       LIST_HEAD(tmp);
24005 +       if (!swait_active(q))
24006 +               return;
24008 +       raw_spin_lock_irq(&q->lock);
24009 +       list_splice_init(&q->task_list, &tmp);
24010 +       while (!list_empty(&tmp)) {
24011 +               curr = list_first_entry(&tmp, typeof(*curr), task_list);
24013 +               wake_up_state(curr->task, TASK_NORMAL);
24014 +               list_del_init(&curr->task_list);
24016 +               if (list_empty(&tmp))
24017 +                       break;
24019 +               raw_spin_unlock_irq(&q->lock);
24020 +               raw_spin_lock_irq(&q->lock);
24021 +       }
24022 +       raw_spin_unlock_irq(&q->lock);
24024 +EXPORT_SYMBOL(swake_up_all);
24026 +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
24028 +       wait->task = current;
24029 +       if (list_empty(&wait->task_list))
24030 +               list_add(&wait->task_list, &q->task_list);
24033 +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
24035 +       unsigned long flags;
24037 +       raw_spin_lock_irqsave(&q->lock, flags);
24038 +       __prepare_to_swait(q, wait);
24039 +       set_current_state(state);
24040 +       raw_spin_unlock_irqrestore(&q->lock, flags);
24042 +EXPORT_SYMBOL(prepare_to_swait);
24044 +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
24046 +       if (signal_pending_state(state, current))
24047 +               return -ERESTARTSYS;
24049 +       prepare_to_swait(q, wait, state);
24051 +       return 0;
24053 +EXPORT_SYMBOL(prepare_to_swait_event);
24055 +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
24057 +       __set_current_state(TASK_RUNNING);
24058 +       if (!list_empty(&wait->task_list))
24059 +               list_del_init(&wait->task_list);
24062 +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
24064 +       unsigned long flags;
24066 +       __set_current_state(TASK_RUNNING);
24068 +       if (!list_empty_careful(&wait->task_list)) {
24069 +               raw_spin_lock_irqsave(&q->lock, flags);
24070 +               list_del_init(&wait->task_list);
24071 +               raw_spin_unlock_irqrestore(&q->lock, flags);
24072 +       }
24074 +EXPORT_SYMBOL(finish_swait);
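Not part of the patch, and not a runnable program: a pseudo-usage sketch of how a waiter and a waker would pair up the simple wait queue primitives added above. The DECLARE_SWAIT_QUEUE_HEAD()/DECLARE_SWAITQUEUE() initialisers are assumed to come from the matching <linux/swait.h> in this patch; the functions themselves are exactly the ones exported by swait.c above.

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static int demo_done;

static void demo_waiter(void)
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                /* Queue ourselves and set the sleep state under q->lock. */
                prepare_to_swait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
                if (demo_done)
                        break;
                schedule();
        }
        /* Back to TASK_RUNNING and off the wait list. */
        finish_swait(&demo_wq, &wait);
}

static void demo_waker(void)
{
        demo_done = 1;
        swake_up(&demo_wq);     /* wakes at most one waiter */
}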
24075 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
24076 new file mode 100644
24077 index 000000000000..1950f40ca725
24078 --- /dev/null
24079 +++ b/kernel/sched/swork.c
24080 @@ -0,0 +1,173 @@
24082 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
24083 + *
24084 + * Provides a framework for enqueuing callbacks from irq context
24085 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
24086 + */
24088 +#include <linux/swait.h>
24089 +#include <linux/swork.h>
24090 +#include <linux/kthread.h>
24091 +#include <linux/slab.h>
24092 +#include <linux/spinlock.h>
24093 +#include <linux/export.h>
24095 +#define SWORK_EVENT_PENDING     (1 << 0)
24097 +static DEFINE_MUTEX(worker_mutex);
24098 +static struct sworker *glob_worker;
24100 +struct sworker {
24101 +       struct list_head events;
24102 +       struct swait_queue_head wq;
24104 +       raw_spinlock_t lock;
24106 +       struct task_struct *task;
24107 +       int refs;
24110 +static bool swork_readable(struct sworker *worker)
24112 +       bool r;
24114 +       if (kthread_should_stop())
24115 +               return true;
24117 +       raw_spin_lock_irq(&worker->lock);
24118 +       r = !list_empty(&worker->events);
24119 +       raw_spin_unlock_irq(&worker->lock);
24121 +       return r;
24124 +static int swork_kthread(void *arg)
24126 +       struct sworker *worker = arg;
24128 +       for (;;) {
24129 +               swait_event_interruptible(worker->wq,
24130 +                                       swork_readable(worker));
24131 +               if (kthread_should_stop())
24132 +                       break;
24134 +               raw_spin_lock_irq(&worker->lock);
24135 +               while (!list_empty(&worker->events)) {
24136 +                       struct swork_event *sev;
24138 +                       sev = list_first_entry(&worker->events,
24139 +                                       struct swork_event, item);
24140 +                       list_del(&sev->item);
24141 +                       raw_spin_unlock_irq(&worker->lock);
24143 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
24144 +                                                        &sev->flags));
24145 +                       sev->func(sev);
24146 +                       raw_spin_lock_irq(&worker->lock);
24147 +               }
24148 +               raw_spin_unlock_irq(&worker->lock);
24149 +       }
24150 +       return 0;
24153 +static struct sworker *swork_create(void)
24155 +       struct sworker *worker;
24157 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
24158 +       if (!worker)
24159 +               return ERR_PTR(-ENOMEM);
24161 +       INIT_LIST_HEAD(&worker->events);
24162 +       raw_spin_lock_init(&worker->lock);
24163 +       init_swait_queue_head(&worker->wq);
24165 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
24166 +       if (IS_ERR(worker->task)) {
24167 +               kfree(worker);
24168 +               return ERR_PTR(-ENOMEM);
24169 +       }
24171 +       return worker;
24174 +static void swork_destroy(struct sworker *worker)
24176 +       kthread_stop(worker->task);
24178 +       WARN_ON(!list_empty(&worker->events));
24179 +       kfree(worker);
24182 +/**
24183 + * swork_queue - queue swork
24184 + *
24185 + * Returns %false if @sev was already on a queue, %true otherwise.
24186 + *
24187 + * The work is queued and processed on a random CPU
24188 + */
24189 +bool swork_queue(struct swork_event *sev)
24191 +       unsigned long flags;
24193 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
24194 +               return false;
24196 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
24197 +       list_add_tail(&sev->item, &glob_worker->events);
24198 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
24200 +       swake_up(&glob_worker->wq);
24201 +       return true;
24203 +EXPORT_SYMBOL_GPL(swork_queue);
24205 +/**
24206 + * swork_get - get an instance of the sworker
24207 + *
24208 + * Returns a negative error code if the initialization of the worker did not
24209 + * work, %0 otherwise.
24210 + *
24211 + */
24212 +int swork_get(void)
24214 +       struct sworker *worker;
24216 +       mutex_lock(&worker_mutex);
24217 +       if (!glob_worker) {
24218 +               worker = swork_create();
24219 +               if (IS_ERR(worker)) {
24220 +                       mutex_unlock(&worker_mutex);
24221 +                       return -ENOMEM;
24222 +               }
24224 +               glob_worker = worker;
24225 +       }
24227 +       glob_worker->refs++;
24228 +       mutex_unlock(&worker_mutex);
24230 +       return 0;
24232 +EXPORT_SYMBOL_GPL(swork_get);
24234 +/**
24235 + * swork_put - puts an instance of the sworker
24236 + *
24237 + * Will destroy the sworker thread. This function must not be called until all
24238 + * queued events have been completed.
24239 + */
24240 +void swork_put(void)
24242 +       mutex_lock(&worker_mutex);
24244 +       glob_worker->refs--;
24245 +       if (glob_worker->refs > 0)
24246 +               goto out;
24248 +       swork_destroy(glob_worker);
24249 +       glob_worker = NULL;
24250 +out:
24251 +       mutex_unlock(&worker_mutex);
24253 +EXPORT_SYMBOL_GPL(swork_put);
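Not part of the patch, and not a runnable program: a pseudo-usage sketch of the swork API added above. The .func field name is taken from what swork_kthread() itself dereferences; real users would normally initialise the event through the INIT_SWORK() style helper in <linux/swork.h>, which is assumed here rather than shown.

/* Runs in the "kswork" kthread, i.e. in sleepable process context. */
static void demo_callback(struct swork_event *sev)
{
        pr_info("deferred work executed\n");
}

static struct swork_event demo_event = {
        .func = demo_callback,
};

static int demo_setup(void)
{
        int err = swork_get();          /* create or reference the worker */

        if (err)
                return err;
        swork_queue(&demo_event);       /* safe to call from hard irq context */
        return 0;
}

static void demo_teardown(void)
{
        swork_put();                    /* drop the reference again */
}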
24254 diff --git a/kernel/signal.c b/kernel/signal.c
24255 index f3f1f7a972fd..bc2c990f3f63 100644
24256 --- a/kernel/signal.c
24257 +++ b/kernel/signal.c
24258 @@ -14,6 +14,7 @@
24259  #include <linux/export.h>
24260  #include <linux/init.h>
24261  #include <linux/sched.h>
24262 +#include <linux/sched/rt.h>
24263  #include <linux/fs.h>
24264  #include <linux/tty.h>
24265  #include <linux/binfmts.h>
24266 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
24267         return false;
24270 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
24272 +       struct sigqueue *q = t->sigqueue_cache;
24274 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
24275 +               return NULL;
24276 +       return q;
24279 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
24281 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
24282 +               return 0;
24283 +       return 1;
24286  /*
24287   * allocate a new signal queue record
24288   * - this may be called without locks if and only if t == current, otherwise an
24289   *   appropriate lock must be held to stop the target task from exiting
24290   */
24291  static struct sigqueue *
24292 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
24293 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
24294 +                   int override_rlimit, int fromslab)
24296         struct sigqueue *q = NULL;
24297         struct user_struct *user;
24298 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
24299         if (override_rlimit ||
24300             atomic_read(&user->sigpending) <=
24301                         task_rlimit(t, RLIMIT_SIGPENDING)) {
24302 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
24303 +               if (!fromslab)
24304 +                       q = get_task_cache(t);
24305 +               if (!q)
24306 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
24307         } else {
24308                 print_dropped_signal(sig);
24309         }
24310 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
24311         return q;
24314 +static struct sigqueue *
24315 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
24316 +                int override_rlimit)
24318 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
24321  static void __sigqueue_free(struct sigqueue *q)
24323         if (q->flags & SIGQUEUE_PREALLOC)
24324 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
24325         kmem_cache_free(sigqueue_cachep, q);
24328 +static void sigqueue_free_current(struct sigqueue *q)
24330 +       struct user_struct *up;
24332 +       if (q->flags & SIGQUEUE_PREALLOC)
24333 +               return;
24335 +       up = q->user;
24336 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
24337 +               atomic_dec(&up->sigpending);
24338 +               free_uid(up);
24339 +       } else
24340 +                 __sigqueue_free(q);
24343  void flush_sigqueue(struct sigpending *queue)
24345         struct sigqueue *q;
24346 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
24349  /*
24350 + * Called from __exit_signal. Flush tsk->pending and
24351 + * tsk->sigqueue_cache
24352 + */
24353 +void flush_task_sigqueue(struct task_struct *tsk)
24355 +       struct sigqueue *q;
24357 +       flush_sigqueue(&tsk->pending);
24359 +       q = get_task_cache(tsk);
24360 +       if (q)
24361 +               kmem_cache_free(sigqueue_cachep, q);
24365   * Flush all pending signals for this kthread.
24366   */
24367  void flush_signals(struct task_struct *t)
24368 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
24369  still_pending:
24370                 list_del_init(&first->list);
24371                 copy_siginfo(info, &first->info);
24372 -               __sigqueue_free(first);
24373 +               sigqueue_free_current(first);
24374         } else {
24375                 /*
24376                  * Ok, it wasn't in the queue.  This must be
24377 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
24379         int signr;
24381 +       WARN_ON_ONCE(tsk != current);
24383         /* We only dequeue private signals from ourselves, we don't let
24384          * signalfd steal them
24385          */
24386 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
24387   * We don't want to have recursive SIGSEGV's etc, for example,
24388   * that is why we also clear SIGNAL_UNKILLABLE.
24389   */
24390 -int
24391 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24392 +static int
24393 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24395         unsigned long int flags;
24396         int ret, blocked, ignored;
24397 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24398         return ret;
24401 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
24404 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
24405 + * since it can not enable preemption, and the signal code's spin_locks
24406 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
24407 + * send the signal on exit of the trap.
24408 + */
24409 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
24410 +       if (in_atomic()) {
24411 +               if (WARN_ON_ONCE(t != current))
24412 +                       return 0;
24413 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
24414 +                       return 0;
24416 +               if (is_si_special(info)) {
24417 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
24418 +                       t->forced_info.si_signo = sig;
24419 +                       t->forced_info.si_errno = 0;
24420 +                       t->forced_info.si_code = SI_KERNEL;
24421 +                       t->forced_info.si_pid = 0;
24422 +                       t->forced_info.si_uid = 0;
24423 +               } else {
24424 +                       t->forced_info = *info;
24425 +               }
24427 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
24428 +               return 0;
24429 +       }
24430 +#endif
24431 +       return do_force_sig_info(sig, info, t);
24434  /*
24435   * Nuke all other threads in the group.
24436   */
24437 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
24438                  * Disable interrupts early to avoid deadlocks.
24439                  * See rcu_read_unlock() comment header for details.
24440                  */
24441 -               local_irq_save(*flags);
24442 +               local_irq_save_nort(*flags);
24443                 rcu_read_lock();
24444                 sighand = rcu_dereference(tsk->sighand);
24445                 if (unlikely(sighand == NULL)) {
24446                         rcu_read_unlock();
24447 -                       local_irq_restore(*flags);
24448 +                       local_irq_restore_nort(*flags);
24449                         break;
24450                 }
24451                 /*
24452 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
24453                 }
24454                 spin_unlock(&sighand->siglock);
24455                 rcu_read_unlock();
24456 -               local_irq_restore(*flags);
24457 +               local_irq_restore_nort(*flags);
24458         }
24460         return sighand;
24461 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
24462   */
24463  struct sigqueue *sigqueue_alloc(void)
24465 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
24466 +       /* Preallocated sigqueue objects always from the slabcache ! */
24467 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
24469         if (q)
24470                 q->flags |= SIGQUEUE_PREALLOC;
24471 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
24472                 if (gstop_done && ptrace_reparented(current))
24473                         do_notify_parent_cldstop(current, false, why);
24475 -               /*
24476 -                * Don't want to allow preemption here, because
24477 -                * sys_ptrace() needs this task to be inactive.
24478 -                *
24479 -                * XXX: implement read_unlock_no_resched().
24480 -                */
24481 -               preempt_disable();
24482                 read_unlock(&tasklist_lock);
24483 -               preempt_enable_no_resched();
24484                 freezable_schedule();
24485         } else {
24486                 /*
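Not part of the patch: a stand-alone user-space model (C11 atomics in place of cmpxchg(), hypothetical demo_* names) of the one-slot, lock-free per-task sigqueue cache that get_task_cache()/put_task_cache() implement earlier in this signal.c diff. sigqueue_free_current() parks the entry in the slot instead of freeing it, and the next allocation for an RT task reuses it without touching the slab.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_sigqueue { int sig; };

static _Atomic(struct demo_sigqueue *) cache_slot;

static struct demo_sigqueue *demo_get_cache(void)
{
        struct demo_sigqueue *q = atomic_load(&cache_slot);

        /* Claim the cached entry only if nobody emptied the slot first. */
        if (q && atomic_compare_exchange_strong(&cache_slot, &q, NULL))
                return q;
        return NULL;
}

static int demo_put_cache(struct demo_sigqueue *q)
{
        struct demo_sigqueue *expected = NULL;

        /* Stash q only if the slot is currently empty; 0 means cached. */
        return atomic_compare_exchange_strong(&cache_slot, &expected, q) ? 0 : 1;
}

int main(void)
{
        struct demo_sigqueue *q = malloc(sizeof(*q));

        if (demo_put_cache(q) == 0)
                puts("cached instead of freeing");
        if (demo_get_cache() == q)
                puts("allocation served from the cache");
        free(q);
        return 0;
}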
24487 diff --git a/kernel/softirq.c b/kernel/softirq.c
24488 index 479e4436f787..cb9c1d5dee10 100644
24489 --- a/kernel/softirq.c
24490 +++ b/kernel/softirq.c
24491 @@ -21,10 +21,12 @@
24492  #include <linux/freezer.h>
24493  #include <linux/kthread.h>
24494  #include <linux/rcupdate.h>
24495 +#include <linux/delay.h>
24496  #include <linux/ftrace.h>
24497  #include <linux/smp.h>
24498  #include <linux/smpboot.h>
24499  #include <linux/tick.h>
24500 +#include <linux/locallock.h>
24501  #include <linux/irq.h>
24503  #define CREATE_TRACE_POINTS
24504 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
24505  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
24507  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
24508 +#ifdef CONFIG_PREEMPT_RT_FULL
24509 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
24510 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
24511 +#endif
24513  const char * const softirq_to_name[NR_SOFTIRQS] = {
24514         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
24515         "TASKLET", "SCHED", "HRTIMER", "RCU"
24516  };
24518 +#ifdef CONFIG_NO_HZ_COMMON
24519 +# ifdef CONFIG_PREEMPT_RT_FULL
24521 +struct softirq_runner {
24522 +       struct task_struct *runner[NR_SOFTIRQS];
24525 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
24527 +static inline void softirq_set_runner(unsigned int sirq)
24529 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24531 +       sr->runner[sirq] = current;
24534 +static inline void softirq_clr_runner(unsigned int sirq)
24536 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24538 +       sr->runner[sirq] = NULL;
24542 + * On preempt-rt a softirq running context might be blocked on a
24543 + * lock. There might be no other runnable task on this CPU because the
24544 + * lock owner runs on some other CPU. So we have to go into idle with
24545 + * the pending bit set. Therefore we need to check this, otherwise we
24546 + * warn about false positives which confuses users and defeats the
24547 + * whole purpose of this test.
24548 + *
24549 + * This code is called with interrupts disabled.
24550 + */
24551 +void softirq_check_pending_idle(void)
24553 +       static int rate_limit;
24554 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
24555 +       u32 warnpending;
24556 +       int i;
24558 +       if (rate_limit >= 10)
24559 +               return;
24561 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
24562 +       for (i = 0; i < NR_SOFTIRQS; i++) {
24563 +               struct task_struct *tsk = sr->runner[i];
24565 +               /*
24566 +                * The wakeup code in rtmutex.c wakes up the task
24567 +                * _before_ it sets pi_blocked_on to NULL under
24568 +                * tsk->pi_lock. So we need to check for both: state
24569 +                * and pi_blocked_on.
24570 +                */
24571 +               if (tsk) {
24572 +                       raw_spin_lock(&tsk->pi_lock);
24573 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
24574 +                               /* Clear all bits pending in that task */
24575 +                               warnpending &= ~(tsk->softirqs_raised);
24576 +                               warnpending &= ~(1 << i);
24577 +                       }
24578 +                       raw_spin_unlock(&tsk->pi_lock);
24579 +               }
24580 +       }
24582 +       if (warnpending) {
24583 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24584 +                      warnpending);
24585 +               rate_limit++;
24586 +       }
24588 +# else
24590 + * On !PREEMPT_RT we just printk rate limited:
24591 + */
24592 +void softirq_check_pending_idle(void)
24594 +       static int rate_limit;
24596 +       if (rate_limit < 10 &&
24597 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
24598 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
24599 +                      local_softirq_pending());
24600 +               rate_limit++;
24601 +       }
24603 +# endif
24605 +#else /* !CONFIG_NO_HZ_COMMON */
24606 +static inline void softirq_set_runner(unsigned int sirq) { }
24607 +static inline void softirq_clr_runner(unsigned int sirq) { }
24608 +#endif
24610  /*
24611   * we cannot loop indefinitely here to avoid userspace starvation,
24612   * but we also don't want to introduce a worst case 1/HZ latency
24613 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
24614                 wake_up_process(tsk);
24617 +#ifdef CONFIG_PREEMPT_RT_FULL
24618 +static void wakeup_timer_softirqd(void)
24620 +       /* Interrupts are disabled: no need to stop preemption */
24621 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
24623 +       if (tsk && tsk->state != TASK_RUNNING)
24624 +               wake_up_process(tsk);
24626 +#endif
24628 +static void handle_softirq(unsigned int vec_nr)
24630 +       struct softirq_action *h = softirq_vec + vec_nr;
24631 +       int prev_count;
24633 +       prev_count = preempt_count();
24635 +       kstat_incr_softirqs_this_cpu(vec_nr);
24637 +       trace_softirq_entry(vec_nr);
24638 +       h->action(h);
24639 +       trace_softirq_exit(vec_nr);
24640 +       if (unlikely(prev_count != preempt_count())) {
24641 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24642 +                      vec_nr, softirq_to_name[vec_nr], h->action,
24643 +                      prev_count, preempt_count());
24644 +               preempt_count_set(prev_count);
24645 +       }
24648 +#ifndef CONFIG_PREEMPT_RT_FULL
24649 +static inline int ksoftirqd_softirq_pending(void)
24651 +       return local_softirq_pending();
24654 +static void handle_pending_softirqs(u32 pending)
24656 +       struct softirq_action *h = softirq_vec;
24657 +       int softirq_bit;
24659 +       local_irq_enable();
24661 +       h = softirq_vec;
24663 +       while ((softirq_bit = ffs(pending))) {
24664 +               unsigned int vec_nr;
24666 +               h += softirq_bit - 1;
24667 +               vec_nr = h - softirq_vec;
24668 +               handle_softirq(vec_nr);
24670 +               h++;
24671 +               pending >>= softirq_bit;
24672 +       }
24674 +       rcu_bh_qs();
24675 +       local_irq_disable();
24678 +static void run_ksoftirqd(unsigned int cpu)
24680 +       local_irq_disable();
24681 +       if (ksoftirqd_softirq_pending()) {
24682 +               __do_softirq();
24683 +               local_irq_enable();
24684 +               cond_resched_rcu_qs();
24685 +               return;
24686 +       }
24687 +       local_irq_enable();
24690  /*
24691   * preempt_count and SOFTIRQ_OFFSET usage:
24692   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
24693 @@ -116,9 +287,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
24695         if (preempt_count() == cnt) {
24696  #ifdef CONFIG_DEBUG_PREEMPT
24697 -               current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
24698 +               current->preempt_disable_ip = get_lock_parent_ip();
24699  #endif
24700 -               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
24701 +               trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
24702         }
24704  EXPORT_SYMBOL(__local_bh_disable_ip);
24705 @@ -232,10 +403,8 @@ asmlinkage __visible void __do_softirq(void)
24706         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
24707         unsigned long old_flags = current->flags;
24708         int max_restart = MAX_SOFTIRQ_RESTART;
24709 -       struct softirq_action *h;
24710         bool in_hardirq;
24711         __u32 pending;
24712 -       int softirq_bit;
24714         /*
24715          * Mask out PF_MEMALLOC as current task context is borrowed for the
24716 @@ -254,36 +423,7 @@ restart:
24717         /* Reset the pending bitmask before enabling irqs */
24718         set_softirq_pending(0);
24720 -       local_irq_enable();
24722 -       h = softirq_vec;
24724 -       while ((softirq_bit = ffs(pending))) {
24725 -               unsigned int vec_nr;
24726 -               int prev_count;
24728 -               h += softirq_bit - 1;
24730 -               vec_nr = h - softirq_vec;
24731 -               prev_count = preempt_count();
24733 -               kstat_incr_softirqs_this_cpu(vec_nr);
24735 -               trace_softirq_entry(vec_nr);
24736 -               h->action(h);
24737 -               trace_softirq_exit(vec_nr);
24738 -               if (unlikely(prev_count != preempt_count())) {
24739 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
24740 -                              vec_nr, softirq_to_name[vec_nr], h->action,
24741 -                              prev_count, preempt_count());
24742 -                       preempt_count_set(prev_count);
24743 -               }
24744 -               h++;
24745 -               pending >>= softirq_bit;
24746 -       }
24748 -       rcu_bh_qs();
24749 -       local_irq_disable();
24750 +       handle_pending_softirqs(pending);
24752         pending = local_softirq_pending();
24753         if (pending) {
24754 @@ -320,6 +460,310 @@ asmlinkage __visible void do_softirq(void)
24757  /*
24758 + * This function must run with irqs disabled!
24759 + */
24760 +void raise_softirq_irqoff(unsigned int nr)
24762 +       __raise_softirq_irqoff(nr);
24764 +       /*
24765 +        * If we're in an interrupt or softirq, we're done
24766 +        * (this also catches softirq-disabled code). We will
24767 +        * actually run the softirq once we return from
24768 +        * the irq or softirq.
24769 +        *
24770 +        * Otherwise we wake up ksoftirqd to make sure we
24771 +        * schedule the softirq soon.
24772 +        */
24773 +       if (!in_interrupt())
24774 +               wakeup_softirqd();
24777 +void __raise_softirq_irqoff(unsigned int nr)
24779 +       trace_softirq_raise(nr);
24780 +       or_softirq_pending(1UL << nr);
24783 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
24784 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
24785 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
24787 +#else /* !PREEMPT_RT_FULL */
24790 + * On RT we serialize softirq execution with a cpu local lock per softirq
24791 + */
24792 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
24794 +void __init softirq_early_init(void)
24796 +       int i;
24798 +       for (i = 0; i < NR_SOFTIRQS; i++)
24799 +               local_irq_lock_init(local_softirq_locks[i]);
24802 +static void lock_softirq(int which)
24804 +       local_lock(local_softirq_locks[which]);
24807 +static void unlock_softirq(int which)
24809 +       local_unlock(local_softirq_locks[which]);
24812 +static void do_single_softirq(int which)
24814 +       unsigned long old_flags = current->flags;
24816 +       current->flags &= ~PF_MEMALLOC;
24817 +       vtime_account_irq_enter(current);
24818 +       current->flags |= PF_IN_SOFTIRQ;
24819 +       lockdep_softirq_enter();
24820 +       local_irq_enable();
24821 +       handle_softirq(which);
24822 +       local_irq_disable();
24823 +       lockdep_softirq_exit();
24824 +       current->flags &= ~PF_IN_SOFTIRQ;
24825 +       vtime_account_irq_enter(current);
24826 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
24830 + * Called with interrupts disabled. Process softirqs which were raised
24831 + * in current context (or on behalf of ksoftirqd).
24832 + */
24833 +static void do_current_softirqs(void)
24835 +       while (current->softirqs_raised) {
24836 +               int i = __ffs(current->softirqs_raised);
24837 +               unsigned int pending, mask = (1U << i);
24839 +               current->softirqs_raised &= ~mask;
24840 +               local_irq_enable();
24842 +               /*
24843 +                * If the lock is contended, we boost the owner to
24844 +                * process the softirq or leave the critical section
24845 +                * now.
24846 +                */
24847 +               lock_softirq(i);
24848 +               local_irq_disable();
24849 +               softirq_set_runner(i);
24850 +               /*
24851 +                * Check the local_softirq_pending() bits to see whether
24852 +                * we still need to process this or if someone
24853 +                * else took care of it.
24854 +                */
24855 +               pending = local_softirq_pending();
24856 +               if (pending & mask) {
24857 +                       set_softirq_pending(pending & ~mask);
24858 +                       do_single_softirq(i);
24859 +               }
24860 +               softirq_clr_runner(i);
24861 +               WARN_ON(current->softirq_nestcnt != 1);
24862 +               local_irq_enable();
24863 +               unlock_softirq(i);
24864 +               local_irq_disable();
24865 +       }
24868 +void __local_bh_disable(void)
24870 +       if (++current->softirq_nestcnt == 1)
24871 +               migrate_disable();
24873 +EXPORT_SYMBOL(__local_bh_disable);
24875 +void __local_bh_enable(void)
24877 +       if (WARN_ON(current->softirq_nestcnt == 0))
24878 +               return;
24880 +       local_irq_disable();
24881 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
24882 +               do_current_softirqs();
24883 +       local_irq_enable();
24885 +       if (--current->softirq_nestcnt == 0)
24886 +               migrate_enable();
24888 +EXPORT_SYMBOL(__local_bh_enable);
24890 +void _local_bh_enable(void)
24892 +       if (WARN_ON(current->softirq_nestcnt == 0))
24893 +               return;
24894 +       if (--current->softirq_nestcnt == 0)
24895 +               migrate_enable();
24897 +EXPORT_SYMBOL(_local_bh_enable);
24899 +int in_serving_softirq(void)
24901 +       return current->flags & PF_IN_SOFTIRQ;
24903 +EXPORT_SYMBOL(in_serving_softirq);
24905 +/* Called with preemption disabled */
24906 +static void run_ksoftirqd(unsigned int cpu)
24908 +       local_irq_disable();
24909 +       current->softirq_nestcnt++;
24911 +       do_current_softirqs();
24912 +       current->softirq_nestcnt--;
24913 +       local_irq_enable();
24914 +       cond_resched_rcu_qs();
24918 + * Called from netif_rx_ni(). Preemption enabled, but migration
24919 + * disabled. So the cpu can't go away under us.
24920 + */
24921 +void thread_do_softirq(void)
24923 +       if (!in_serving_softirq() && current->softirqs_raised) {
24924 +               current->softirq_nestcnt++;
24925 +               do_current_softirqs();
24926 +               current->softirq_nestcnt--;
24927 +       }
24930 +static void do_raise_softirq_irqoff(unsigned int nr)
24932 +       unsigned int mask;
24934 +       mask = 1UL << nr;
24936 +       trace_softirq_raise(nr);
24937 +       or_softirq_pending(mask);
24939 +       /*
24940 +        * If we are not in a hard interrupt and inside a bh disabled
24941 +        * region, we simply raise the flag on current. local_bh_enable()
24942 +        * will make sure that the softirq is executed. Otherwise we
24943 +        * delegate it to ksoftirqd.
24944 +        */
24945 +       if (!in_irq() && current->softirq_nestcnt)
24946 +               current->softirqs_raised |= mask;
24947 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
24948 +               return;
24950 +       if (mask & TIMER_SOFTIRQS)
24951 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24952 +       else
24953 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24956 +static void wakeup_proper_softirq(unsigned int nr)
24958 +       if ((1UL << nr) & TIMER_SOFTIRQS)
24959 +               wakeup_timer_softirqd();
24960 +       else
24961 +               wakeup_softirqd();
24965 +void __raise_softirq_irqoff(unsigned int nr)
24967 +       do_raise_softirq_irqoff(nr);
24968 +       if (!in_irq() && !current->softirq_nestcnt)
24969 +               wakeup_proper_softirq(nr);
24973 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
24974 + */
24975 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
24977 +       unsigned int mask;
24979 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
24980 +                        !__this_cpu_read(ktimer_softirqd)))
24981 +               return;
24982 +       mask = 1UL << nr;
24984 +       trace_softirq_raise(nr);
24985 +       or_softirq_pending(mask);
24986 +       if (mask & TIMER_SOFTIRQS)
24987 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
24988 +       else
24989 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
24990 +       wakeup_proper_softirq(nr);
24994 + * This function must run with irqs disabled!
24995 + */
24996 +void raise_softirq_irqoff(unsigned int nr)
24998 +       do_raise_softirq_irqoff(nr);
25000 +       /*
25001 +        * If we're in a hard interrupt we let irq return code deal
25002 +        * with the wakeup of ksoftirqd.
25003 +        */
25004 +       if (in_irq())
25005 +               return;
25006 +       /*
25007 +        * If we are in thread context but outside of a bh disabled
25008 +        * region, we need to wake ksoftirqd as well.
25009 +        *
25010 +        * CHECKME: Some of the places which do that could be wrapped
25011 +        * into local_bh_disable/enable pairs. Though it's unclear
25012 +        * whether this is worth the effort. To find those places just
25013 +        * raise a WARN() if the condition is met.
25014 +        */
25015 +       if (!current->softirq_nestcnt)
25016 +               wakeup_proper_softirq(nr);
25019 +static inline int ksoftirqd_softirq_pending(void)
25021 +       return current->softirqs_raised;
25024 +static inline void local_bh_disable_nort(void) { }
25025 +static inline void _local_bh_enable_nort(void) { }
25027 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
25029 +       /* Take over all but timer pending softirqs when starting */
25030 +       local_irq_disable();
25031 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
25032 +       local_irq_enable();
25035 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
25037 +       struct sched_param param = { .sched_priority = 1 };
25039 +       sched_setscheduler(current, SCHED_FIFO, &param);
25041 +       /* Take over timer pending softirqs when starting */
25042 +       local_irq_disable();
25043 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
25044 +       local_irq_enable();
25047 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
25048 +                                                   bool online)
25050 +       struct sched_param param = { .sched_priority = 0 };
25052 +       sched_setscheduler(current, SCHED_NORMAL, &param);
25055 +static int ktimer_softirqd_should_run(unsigned int cpu)
25057 +       return current->softirqs_raised;
25060 +#endif /* PREEMPT_RT_FULL */
25062   * Enter an interrupt context.
25063   */
25064  void irq_enter(void)
25065 @@ -330,9 +774,9 @@ void irq_enter(void)
25066                  * Prevent raise_softirq from needlessly waking up ksoftirqd
25067                  * here, as softirq will be serviced on return from interrupt.
25068                  */
25069 -               local_bh_disable();
25070 +               local_bh_disable_nort();
25071                 tick_irq_enter();
25072 -               _local_bh_enable();
25073 +               _local_bh_enable_nort();
25074         }
25076         __irq_enter();
25077 @@ -340,6 +784,7 @@ void irq_enter(void)
25079  static inline void invoke_softirq(void)
25081 +#ifndef CONFIG_PREEMPT_RT_FULL
25082         if (!force_irqthreads) {
25083  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
25084                 /*
25085 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
25086         } else {
25087                 wakeup_softirqd();
25088         }
25089 +#else /* PREEMPT_RT_FULL */
25090 +       unsigned long flags;
25092 +       local_irq_save(flags);
25093 +       if (__this_cpu_read(ksoftirqd) &&
25094 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
25095 +               wakeup_softirqd();
25096 +       if (__this_cpu_read(ktimer_softirqd) &&
25097 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
25098 +               wakeup_timer_softirqd();
25099 +       local_irq_restore(flags);
25100 +#endif
25103  static inline void tick_irq_exit(void)
25104 @@ -395,26 +852,6 @@ void irq_exit(void)
25105         trace_hardirq_exit(); /* must be last! */
25109 - * This function must run with irqs disabled!
25110 - */
25111 -inline void raise_softirq_irqoff(unsigned int nr)
25113 -       __raise_softirq_irqoff(nr);
25115 -       /*
25116 -        * If we're in an interrupt or softirq, we're done
25117 -        * (this also catches softirq-disabled code). We will
25118 -        * actually run the softirq once we return from
25119 -        * the irq or softirq.
25120 -        *
25121 -        * Otherwise we wake up ksoftirqd to make sure we
25122 -        * schedule the softirq soon.
25123 -        */
25124 -       if (!in_interrupt())
25125 -               wakeup_softirqd();
25128  void raise_softirq(unsigned int nr)
25130         unsigned long flags;
25131 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
25132         local_irq_restore(flags);
25135 -void __raise_softirq_irqoff(unsigned int nr)
25137 -       trace_softirq_raise(nr);
25138 -       or_softirq_pending(1UL << nr);
25141  void open_softirq(int nr, void (*action)(struct softirq_action *))
25143         softirq_vec[nr].action = action;
25144 @@ -446,15 +877,45 @@ struct tasklet_head {
25145  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
25146  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
25148 +static void inline
25149 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
25151 +       if (tasklet_trylock(t)) {
25152 +again:
25153 +               /* We may have been preempted before tasklet_trylock
25154 +                * and __tasklet_action may have already run.
25155 +                * So double check the sched bit while the tasklet
25156 +                * is locked before adding it to the list.
25157 +                */
25158 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
25159 +                       t->next = NULL;
25160 +                       *head->tail = t;
25161 +                       head->tail = &(t->next);
25162 +                       raise_softirq_irqoff(nr);
25163 +                       tasklet_unlock(t);
25164 +               } else {
25165 +                       /* This is subtle. If we hit the corner case above,
25166 +                        * it is possible that we get preempted right here,
25167 +                        * and another task has successfully called
25168 +                        * tasklet_schedule(), then this function, and
25169 +                        * failed on the trylock. Thus we must be sure
25170 +                        * before releasing the tasklet lock, that the
25171 +                        * SCHED_BIT is clear. Otherwise the tasklet
25172 +                        * may get its SCHED_BIT set, but not added to the
25173 +                        * list.
25174 +                        */
25175 +                       if (!tasklet_tryunlock(t))
25176 +                               goto again;
25177 +               }
25178 +       }
25181  void __tasklet_schedule(struct tasklet_struct *t)
25183         unsigned long flags;
25185         local_irq_save(flags);
25186 -       t->next = NULL;
25187 -       *__this_cpu_read(tasklet_vec.tail) = t;
25188 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
25189 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
25190 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
25191         local_irq_restore(flags);
25193  EXPORT_SYMBOL(__tasklet_schedule);
25194 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
25195         unsigned long flags;
25197         local_irq_save(flags);
25198 -       t->next = NULL;
25199 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
25200 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
25201 -       raise_softirq_irqoff(HI_SOFTIRQ);
25202 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
25203         local_irq_restore(flags);
25205  EXPORT_SYMBOL(__tasklet_hi_schedule);
25206 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
25208         BUG_ON(!irqs_disabled());
25210 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
25211 -       __this_cpu_write(tasklet_hi_vec.head, t);
25212 -       __raise_softirq_irqoff(HI_SOFTIRQ);
25213 +       __tasklet_hi_schedule(t);
25215  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
25217 -static void tasklet_action(struct softirq_action *a)
25218 +void  tasklet_enable(struct tasklet_struct *t)
25220 -       struct tasklet_struct *list;
25221 +       if (!atomic_dec_and_test(&t->count))
25222 +               return;
25223 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
25224 +               tasklet_schedule(t);
25226 +EXPORT_SYMBOL(tasklet_enable);
25228 -       local_irq_disable();
25229 -       list = __this_cpu_read(tasklet_vec.head);
25230 -       __this_cpu_write(tasklet_vec.head, NULL);
25231 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
25232 -       local_irq_enable();
25233 +static void __tasklet_action(struct softirq_action *a,
25234 +                            struct tasklet_struct *list)
25236 +       int loops = 1000000;
25238         while (list) {
25239                 struct tasklet_struct *t = list;
25241                 list = list->next;
25243 -               if (tasklet_trylock(t)) {
25244 -                       if (!atomic_read(&t->count)) {
25245 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
25246 -                                                       &t->state))
25247 -                                       BUG();
25248 -                               t->func(t->data);
25249 -                               tasklet_unlock(t);
25250 -                               continue;
25251 -                       }
25252 -                       tasklet_unlock(t);
25253 +               /*
25254 +                * Should always succeed - after a tasklet got on the
25255 +                * list (after getting the SCHED bit set from 0 to 1),
25256 +                * nothing but the tasklet softirq it got queued to can
25257 +                * lock it:
25258 +                */
25259 +               if (!tasklet_trylock(t)) {
25260 +                       WARN_ON(1);
25261 +                       continue;
25262                 }
25264 -               local_irq_disable();
25265                 t->next = NULL;
25266 -               *__this_cpu_read(tasklet_vec.tail) = t;
25267 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
25268 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
25269 -               local_irq_enable();
25271 +               /*
25272 +                * If we cannot handle the tasklet because it's disabled,
25273 +                * mark it as pending. tasklet_enable() will later
25274 +                * re-schedule the tasklet.
25275 +                */
25276 +               if (unlikely(atomic_read(&t->count))) {
25277 +out_disabled:
25278 +                       /* implicit unlock: */
25279 +                       wmb();
25280 +                       t->state = TASKLET_STATEF_PENDING;
25281 +                       continue;
25282 +               }
25284 +               /*
25285 +                * From this point on, the tasklet might be rescheduled
25286 +                * on another CPU, but it can only be added to another
25287 +                * CPU's tasklet list if we unlock the tasklet (which we
25288 +                * don't do yet).
25289 +                */
25290 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
25291 +                       WARN_ON(1);
25293 +again:
25294 +               t->func(t->data);
25296 +               /*
25297 +                * Try to unlock the tasklet. We must use cmpxchg, because
25298 +                * another CPU might have scheduled or disabled the tasklet.
25299 +                * We only allow the STATE_RUN -> 0 transition here.
25300 +                */
25301 +               while (!tasklet_tryunlock(t)) {
25302 +                       /*
25303 +                        * If it got disabled meanwhile, bail out:
25304 +                        */
25305 +                       if (atomic_read(&t->count))
25306 +                               goto out_disabled;
25307 +                       /*
25308 +                        * If it got scheduled meanwhile, re-execute
25309 +                        * the tasklet function:
25310 +                        */
25311 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
25312 +                               goto again;
25313 +                       if (!--loops) {
25314 +                               printk("hm, tasklet state: %08lx\n", t->state);
25315 +                               WARN_ON(1);
25316 +                               tasklet_unlock(t);
25317 +                               break;
25318 +                       }
25319 +               }
25320         }
25323 +static void tasklet_action(struct softirq_action *a)
25325 +       struct tasklet_struct *list;
25327 +       local_irq_disable();
25329 +       list = __this_cpu_read(tasklet_vec.head);
25330 +       __this_cpu_write(tasklet_vec.head, NULL);
25331 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
25333 +       local_irq_enable();
25335 +       __tasklet_action(a, list);
25338  static void tasklet_hi_action(struct softirq_action *a)
25340         struct tasklet_struct *list;
25342         local_irq_disable();
25344         list = __this_cpu_read(tasklet_hi_vec.head);
25345         __this_cpu_write(tasklet_hi_vec.head, NULL);
25346         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
25347 -       local_irq_enable();
25349 -       while (list) {
25350 -               struct tasklet_struct *t = list;
25352 -               list = list->next;
25354 -               if (tasklet_trylock(t)) {
25355 -                       if (!atomic_read(&t->count)) {
25356 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
25357 -                                                       &t->state))
25358 -                                       BUG();
25359 -                               t->func(t->data);
25360 -                               tasklet_unlock(t);
25361 -                               continue;
25362 -                       }
25363 -                       tasklet_unlock(t);
25364 -               }
25365 +       local_irq_enable();
25367 -               local_irq_disable();
25368 -               t->next = NULL;
25369 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
25370 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
25371 -               __raise_softirq_irqoff(HI_SOFTIRQ);
25372 -               local_irq_enable();
25373 -       }
25374 +       __tasklet_action(a, list);
25377  void tasklet_init(struct tasklet_struct *t,
25378 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
25380         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
25381                 do {
25382 -                       yield();
25383 +                       msleep(1);
25384                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
25385         }
25386         tasklet_unlock_wait(t);
25387 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
25388         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
25391 -static int ksoftirqd_should_run(unsigned int cpu)
25393 -       return local_softirq_pending();
25396 -static void run_ksoftirqd(unsigned int cpu)
25397 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
25398 +void tasklet_unlock_wait(struct tasklet_struct *t)
25400 -       local_irq_disable();
25401 -       if (local_softirq_pending()) {
25402 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
25403                 /*
25404 -                * We can safely run softirq on inline stack, as we are not deep
25405 -                * in the task stack here.
25406 +                * Hack for now to avoid this busy-loop:
25407                  */
25408 -               __do_softirq();
25409 -               local_irq_enable();
25410 -               cond_resched_rcu_qs();
25411 -               return;
25412 +#ifdef CONFIG_PREEMPT_RT_FULL
25413 +               msleep(1);
25414 +#else
25415 +               barrier();
25416 +#endif
25417         }
25418 -       local_irq_enable();
25420 +EXPORT_SYMBOL(tasklet_unlock_wait);
25421 +#endif
25423 +static int ksoftirqd_should_run(unsigned int cpu)
25425 +       return ksoftirqd_softirq_pending();
25428  #ifdef CONFIG_HOTPLUG_CPU
25429 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
25431  static struct smp_hotplug_thread softirq_threads = {
25432         .store                  = &ksoftirqd,
25433 +       .setup                  = ksoftirqd_set_sched_params,
25434         .thread_should_run      = ksoftirqd_should_run,
25435         .thread_fn              = run_ksoftirqd,
25436         .thread_comm            = "ksoftirqd/%u",
25437  };
25439 +#ifdef CONFIG_PREEMPT_RT_FULL
25440 +static struct smp_hotplug_thread softirq_timer_threads = {
25441 +       .store                  = &ktimer_softirqd,
25442 +       .setup                  = ktimer_softirqd_set_sched_params,
25443 +       .cleanup                = ktimer_softirqd_clr_sched_params,
25444 +       .thread_should_run      = ktimer_softirqd_should_run,
25445 +       .thread_fn              = run_ksoftirqd,
25446 +       .thread_comm            = "ktimersoftd/%u",
25448 +#endif
25450  static __init int spawn_ksoftirqd(void)
25452         register_cpu_notifier(&cpu_nfb);
25454         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
25455 +#ifdef CONFIG_PREEMPT_RT_FULL
25456 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
25457 +#endif
25459         return 0;
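
The RT side of the kernel/softirq.c changes above serializes softirq execution with one per-CPU lock per softirq vector: do_current_softirqs() takes the lock for a raised vector, re-checks the pending bit under that lock, and only then runs the handler. Below is a minimal user-space sketch of that lock / re-check / run loop, written with plain pthreads; every name in it is illustrative and none of it is kernel API.

#include <pthread.h>
#include <stdio.h>

#define NR_SOFTIRQS 10

static pthread_mutex_t softirq_locks[NR_SOFTIRQS];     /* one lock per vector */
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int pending;                            /* raised-vector bitmask */

static void handle(int nr)
{
        printf("running softirq vector %d\n", nr);
}

/* Process every raised vector: lock it, re-check pending under the lock
 * (someone else may have handled it meanwhile), then run the handler. */
static void process_raised(unsigned int raised)
{
        while (raised) {
                int i = __builtin_ctz(raised);
                unsigned int mask = 1u << i;

                raised &= ~mask;

                pthread_mutex_lock(&softirq_locks[i]);

                pthread_mutex_lock(&pending_lock);
                int still_pending = pending & mask;
                pending &= ~mask;
                pthread_mutex_unlock(&pending_lock);

                if (still_pending)
                        handle(i);

                pthread_mutex_unlock(&softirq_locks[i]);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SOFTIRQS; i++)
                pthread_mutex_init(&softirq_locks[i], NULL);

        pending = (1u << 1) | (1u << 3);        /* pretend two vectors were raised */
        process_raised(pending);
        return 0;
}
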
25461 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
25462 index a3bbaee77c58..f84d3b45cda7 100644
25463 --- a/kernel/stop_machine.c
25464 +++ b/kernel/stop_machine.c
25465 @@ -37,7 +37,7 @@ struct cpu_stop_done {
25466  struct cpu_stopper {
25467         struct task_struct      *thread;
25469 -       spinlock_t              lock;
25470 +       raw_spinlock_t          lock;
25471         bool                    enabled;        /* is this stopper enabled? */
25472         struct list_head        works;          /* list of pending works */
25474 @@ -86,12 +86,12 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
25475         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
25476         unsigned long flags;
25478 -       spin_lock_irqsave(&stopper->lock, flags);
25479 +       raw_spin_lock_irqsave(&stopper->lock, flags);
25480         if (stopper->enabled)
25481                 __cpu_stop_queue_work(stopper, work);
25482         else
25483                 cpu_stop_signal_done(work->done, false);
25484 -       spin_unlock_irqrestore(&stopper->lock, flags);
25485 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
25488  /**
25489 @@ -224,8 +224,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
25490         int err;
25492         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
25493 -       spin_lock_irq(&stopper1->lock);
25494 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
25495 +       raw_spin_lock_irq(&stopper1->lock);
25496 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
25498         err = -ENOENT;
25499         if (!stopper1->enabled || !stopper2->enabled)
25500 @@ -235,8 +235,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
25501         __cpu_stop_queue_work(stopper1, work1);
25502         __cpu_stop_queue_work(stopper2, work2);
25503  unlock:
25504 -       spin_unlock(&stopper2->lock);
25505 -       spin_unlock_irq(&stopper1->lock);
25506 +       raw_spin_unlock(&stopper2->lock);
25507 +       raw_spin_unlock_irq(&stopper1->lock);
25508         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
25510         return err;
25511 @@ -258,7 +258,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
25512         struct cpu_stop_work work1, work2;
25513         struct multi_stop_data msdata;
25515 -       preempt_disable();
25516 +       preempt_disable_nort();
25517         msdata = (struct multi_stop_data){
25518                 .fn = fn,
25519                 .data = arg,
25520 @@ -278,11 +278,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
25521         if (cpu1 > cpu2)
25522                 swap(cpu1, cpu2);
25523         if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
25524 -               preempt_enable();
25525 +               preempt_enable_nort();
25526                 return -ENOENT;
25527         }
25529 -       preempt_enable();
25530 +       preempt_enable_nort();
25532         wait_for_completion(&done.completion);
25534 @@ -315,17 +315,20 @@ static DEFINE_MUTEX(stop_cpus_mutex);
25536  static void queue_stop_cpus_work(const struct cpumask *cpumask,
25537                                  cpu_stop_fn_t fn, void *arg,
25538 -                                struct cpu_stop_done *done)
25539 +                                struct cpu_stop_done *done, bool inactive)
25541         struct cpu_stop_work *work;
25542         unsigned int cpu;
25544         /*
25545 -        * Disable preemption while queueing to avoid getting
25546 -        * preempted by a stopper which might wait for other stoppers
25547 -        * to enter @fn which can lead to deadlock.
25548 +        * Make sure that all work is queued on all cpus before
25549 +        * any of the cpus can execute it.
25550          */
25551 -       lg_global_lock(&stop_cpus_lock);
25552 +       if (!inactive)
25553 +               lg_global_lock(&stop_cpus_lock);
25554 +       else
25555 +               lg_global_trylock_relax(&stop_cpus_lock);
25557         for_each_cpu(cpu, cpumask) {
25558                 work = &per_cpu(cpu_stopper.stop_work, cpu);
25559                 work->fn = fn;
25560 @@ -342,7 +345,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
25561         struct cpu_stop_done done;
25563         cpu_stop_init_done(&done, cpumask_weight(cpumask));
25564 -       queue_stop_cpus_work(cpumask, fn, arg, &done);
25565 +       queue_stop_cpus_work(cpumask, fn, arg, &done, false);
25566         wait_for_completion(&done.completion);
25567         return done.executed ? done.ret : -ENOENT;
25569 @@ -422,9 +425,9 @@ static int cpu_stop_should_run(unsigned int cpu)
25570         unsigned long flags;
25571         int run;
25573 -       spin_lock_irqsave(&stopper->lock, flags);
25574 +       raw_spin_lock_irqsave(&stopper->lock, flags);
25575         run = !list_empty(&stopper->works);
25576 -       spin_unlock_irqrestore(&stopper->lock, flags);
25577 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
25578         return run;
25581 @@ -436,13 +439,13 @@ static void cpu_stopper_thread(unsigned int cpu)
25583  repeat:
25584         work = NULL;
25585 -       spin_lock_irq(&stopper->lock);
25586 +       raw_spin_lock_irq(&stopper->lock);
25587         if (!list_empty(&stopper->works)) {
25588                 work = list_first_entry(&stopper->works,
25589                                         struct cpu_stop_work, list);
25590                 list_del_init(&work->list);
25591         }
25592 -       spin_unlock_irq(&stopper->lock);
25593 +       raw_spin_unlock_irq(&stopper->lock);
25595         if (work) {
25596                 cpu_stop_fn_t fn = work->fn;
25597 @@ -450,6 +453,16 @@ repeat:
25598                 struct cpu_stop_done *done = work->done;
25599                 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
25601 +               /*
25602 +                * Wait until the stopper has finished scheduling on all
25603 +                * cpus
25604 +                */
25605 +               lg_global_lock(&stop_cpus_lock);
25606 +               /*
25607 +                * Let other cpu threads continue as well
25608 +                */
25609 +               lg_global_unlock(&stop_cpus_lock);
25611                 /* cpu stop callbacks are not allowed to sleep */
25612                 preempt_disable();
25614 @@ -520,10 +533,12 @@ static int __init cpu_stop_init(void)
25615         for_each_possible_cpu(cpu) {
25616                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
25618 -               spin_lock_init(&stopper->lock);
25619 +               raw_spin_lock_init(&stopper->lock);
25620                 INIT_LIST_HEAD(&stopper->works);
25621         }
25623 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
25625         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
25626         stop_machine_unpark(raw_smp_processor_id());
25627         stop_machine_initialized = true;
25628 @@ -620,7 +635,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
25629         set_state(&msdata, MULTI_STOP_PREPARE);
25630         cpu_stop_init_done(&done, num_active_cpus());
25631         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
25632 -                            &done);
25633 +                            &done, true);
25634         ret = multi_cpu_stop(&msdata);
25636         /* Busy wait for completion. */
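
The queue_stop_cpus_work() / cpu_stopper_thread() change above turns stop_cpus_lock into a barrier: the queueing side holds it while posting work to every CPU, and each stopper takes and immediately drops it before running its callback, so no callback starts until all work has been queued. The following is a small user-space sketch of that hold-while-queueing gate, assuming plain pthreads; the names are made up for illustration.

#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS 4

static pthread_mutex_t queue_gate = PTHREAD_MUTEX_INITIALIZER;
static int work[NR_WORKERS];

static void *worker(void *arg)
{
        int id = (int)(long)arg;

        /* Touch the gate before running: blocks until queueing is complete. */
        pthread_mutex_lock(&queue_gate);
        pthread_mutex_unlock(&queue_gate);

        printf("worker %d runs item %d\n", id, work[id]);
        return NULL;
}

int main(void)
{
        pthread_t tid[NR_WORKERS];
        int i;

        /* Hold the gate while queueing work for all workers. */
        pthread_mutex_lock(&queue_gate);
        for (i = 0; i < NR_WORKERS; i++) {
                work[i] = 100 + i;
                pthread_create(&tid[i], NULL, worker, (void *)(long)i);
        }
        pthread_mutex_unlock(&queue_gate);      /* all queued: release everyone */

        for (i = 0; i < NR_WORKERS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}
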
25637 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
25638 index 17f7bcff1e02..ba3d60144838 100644
25639 --- a/kernel/time/hrtimer.c
25640 +++ b/kernel/time/hrtimer.c
25641 @@ -48,11 +48,13 @@
25642  #include <linux/sched/rt.h>
25643  #include <linux/sched/deadline.h>
25644  #include <linux/timer.h>
25645 +#include <linux/kthread.h>
25646  #include <linux/freezer.h>
25648  #include <asm/uaccess.h>
25650  #include <trace/events/timer.h>
25651 +#include <trace/events/hist.h>
25653  #include "tick-internal.h"
25655 @@ -717,6 +719,44 @@ static void clock_was_set_work(struct work_struct *work)
25657  static DECLARE_WORK(hrtimer_work, clock_was_set_work);
25659 +#ifdef CONFIG_PREEMPT_RT_FULL
25661 + * RT can not call schedule_work from real interrupt context.
25662 + * Need to make a thread to do the real work.
25663 + */
25664 +static struct task_struct *clock_set_delay_thread;
25665 +static bool do_clock_set_delay;
25667 +static int run_clock_set_delay(void *ignore)
25669 +       while (!kthread_should_stop()) {
25670 +               set_current_state(TASK_INTERRUPTIBLE);
25671 +               if (do_clock_set_delay) {
25672 +                       do_clock_set_delay = false;
25673 +                       schedule_work(&hrtimer_work);
25674 +               }
25675 +               schedule();
25676 +       }
25677 +       __set_current_state(TASK_RUNNING);
25678 +       return 0;
25681 +void clock_was_set_delayed(void)
25683 +       do_clock_set_delay = true;
25684 +       /* Make visible before waking up process */
25685 +       smp_wmb();
25686 +       wake_up_process(clock_set_delay_thread);
25689 +static __init int create_clock_set_delay_thread(void)
25691 +       clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
25692 +       BUG_ON(!clock_set_delay_thread);
25693 +       return 0;
25695 +early_initcall(create_clock_set_delay_thread);
25696 +#else /* PREEMPT_RT_FULL */
25697  /*
25698   * Called from timekeeping and resume code to reprogram the hrtimer
25699   * interrupt device on all cpus.
25700 @@ -725,6 +765,7 @@ void clock_was_set_delayed(void)
25702         schedule_work(&hrtimer_work);
25704 +#endif
25706  #else
25708 @@ -734,11 +775,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
25709  static inline void hrtimer_switch_to_hres(void) { }
25710  static inline void
25711  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
25712 -static inline int hrtimer_reprogram(struct hrtimer *timer,
25713 -                                   struct hrtimer_clock_base *base)
25715 -       return 0;
25717 +static inline void hrtimer_reprogram(struct hrtimer *timer,
25718 +                                    struct hrtimer_clock_base *base) { }
25719  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
25720  static inline void retrigger_next_event(void *arg) { }
25722 @@ -870,6 +908,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
25724  EXPORT_SYMBOL_GPL(hrtimer_forward);
25726 +#ifdef CONFIG_PREEMPT_RT_BASE
25727 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
25729 +/**
25730 + * hrtimer_wait_for_timer - Wait for a running timer
25731 + *
25732 + * @timer:     timer to wait for
25733 + *
25734 + * The function waits on the waitqueue of the timer base in case the
25735 + * timer's callback function is currently executing. The
25736 + * waitqueue is woken up after the timer callback function has
25737 + * finished execution.
25738 + */
25739 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
25741 +       struct hrtimer_clock_base *base = timer->base;
25743 +       if (base && base->cpu_base && !timer->irqsafe)
25744 +               wait_event(base->cpu_base->wait,
25745 +                               !(hrtimer_callback_running(timer)));
25748 +#else
25749 +# define wake_up_timer_waiters(b)      do { } while (0)
25750 +#endif
25752  /*
25753   * enqueue_hrtimer - internal function to (re)start a timer
25754   *
25755 @@ -911,6 +975,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
25756         if (!(state & HRTIMER_STATE_ENQUEUED))
25757                 return;
25759 +       if (unlikely(!list_empty(&timer->cb_entry))) {
25760 +               list_del_init(&timer->cb_entry);
25761 +               return;
25762 +       }
25764         if (!timerqueue_del(&base->active, &timer->node))
25765                 cpu_base->active_bases &= ~(1 << base->index);
25767 @@ -1006,7 +1075,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
25768         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
25770         timer_stats_hrtimer_set_start_info(timer);
25771 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
25772 +       {
25773 +               ktime_t now = new_base->get_time();
25775 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
25776 +                       timer->praecox = now;
25777 +               else
25778 +                       timer->praecox = ktime_set(0, 0);
25779 +       }
25780 +#endif
25781         leftmost = enqueue_hrtimer(timer, new_base);
25782         if (!leftmost)
25783                 goto unlock;
25784 @@ -1078,7 +1156,7 @@ int hrtimer_cancel(struct hrtimer *timer)
25786                 if (ret >= 0)
25787                         return ret;
25788 -               cpu_relax();
25789 +               hrtimer_wait_for_timer(timer);
25790         }
25792  EXPORT_SYMBOL_GPL(hrtimer_cancel);
25793 @@ -1142,6 +1220,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
25795         base = hrtimer_clockid_to_base(clock_id);
25796         timer->base = &cpu_base->clock_base[base];
25797 +       INIT_LIST_HEAD(&timer->cb_entry);
25798         timerqueue_init(&timer->node);
25800  #ifdef CONFIG_TIMER_STATS
25801 @@ -1182,6 +1261,7 @@ bool hrtimer_active(const struct hrtimer *timer)
25802                 seq = raw_read_seqcount_begin(&cpu_base->seq);
25804                 if (timer->state != HRTIMER_STATE_INACTIVE ||
25805 +                   cpu_base->running_soft == timer ||
25806                     cpu_base->running == timer)
25807                         return true;
25809 @@ -1280,10 +1360,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
25810         cpu_base->running = NULL;
25813 +#ifdef CONFIG_PREEMPT_RT_BASE
25814 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
25815 +                                struct hrtimer_clock_base *base)
25817 +       int leftmost;
25819 +       if (restart != HRTIMER_NORESTART &&
25820 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
25822 +               leftmost = enqueue_hrtimer(timer, base);
25823 +               if (!leftmost)
25824 +                       return;
25825 +#ifdef CONFIG_HIGH_RES_TIMERS
25826 +               if (!hrtimer_is_hres_active(timer)) {
25827 +                       /*
25828 +                        * Kick to reschedule the next tick to handle the new timer
25829 +                        * on dynticks target.
25830 +                        */
25831 +                       if (base->cpu_base->nohz_active)
25832 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
25833 +               } else {
25835 +                       hrtimer_reprogram(timer, base);
25836 +               }
25837 +#endif
25838 +       }
25842 + * The changes in mainline which removed the callback modes from
25843 + * hrtimer are not yet working with -rt. The non wakeup_process()
25844 + * based callbacks which involve sleeping locks need to be treated
25845 + * separately.
25846 + */
25847 +static void hrtimer_rt_run_pending(void)
25849 +       enum hrtimer_restart (*fn)(struct hrtimer *);
25850 +       struct hrtimer_cpu_base *cpu_base;
25851 +       struct hrtimer_clock_base *base;
25852 +       struct hrtimer *timer;
25853 +       int index, restart;
25855 +       local_irq_disable();
25856 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
25858 +       raw_spin_lock(&cpu_base->lock);
25860 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
25861 +               base = &cpu_base->clock_base[index];
25863 +               while (!list_empty(&base->expired)) {
25864 +                       timer = list_first_entry(&base->expired,
25865 +                                                struct hrtimer, cb_entry);
25867 +                       /*
25868 +                        * Same as the above __run_hrtimer function
25869 +                        * except that we run with interrupts enabled.
25870 +                        */
25871 +                       debug_deactivate(timer);
25872 +                       cpu_base->running_soft = timer;
25873 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25875 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
25876 +                       timer_stats_account_hrtimer(timer);
25877 +                       fn = timer->function;
25879 +                       raw_spin_unlock_irq(&cpu_base->lock);
25880 +                       restart = fn(timer);
25881 +                       raw_spin_lock_irq(&cpu_base->lock);
25883 +                       hrtimer_rt_reprogram(restart, timer, base);
25884 +                       raw_write_seqcount_barrier(&cpu_base->seq);
25886 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
25887 +                       cpu_base->running_soft = NULL;
25888 +               }
25889 +       }
25891 +       raw_spin_unlock_irq(&cpu_base->lock);
25893 +       wake_up_timer_waiters(cpu_base);
25896 +static int hrtimer_rt_defer(struct hrtimer *timer)
25898 +       if (timer->irqsafe)
25899 +               return 0;
25901 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
25902 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
25903 +       return 1;
25906 +#else
25908 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
25910 +#endif
25912 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
25914  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25916         struct hrtimer_clock_base *base = cpu_base->clock_base;
25917         unsigned int active = cpu_base->active_bases;
25918 +       int raise = 0;
25920         for (; active; base++, active >>= 1) {
25921                 struct timerqueue_node *node;
25922 @@ -1299,6 +1481,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25924                         timer = container_of(node, struct hrtimer, node);
25926 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
25927 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
25928 +                               timer->praecox : hrtimer_get_expires(timer),
25929 +                               basenow)),
25930 +                           current,
25931 +                           timer->function == hrtimer_wakeup ?
25932 +                           container_of(timer, struct hrtimer_sleeper,
25933 +                               timer)->task : NULL);
25935                         /*
25936                          * The immediate goal for using the softexpires is
25937                          * minimizing wakeups, not running timers at the
25938 @@ -1314,9 +1505,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
25939                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
25940                                 break;
25942 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
25943 +                       if (!hrtimer_rt_defer(timer))
25944 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
25945 +                       else
25946 +                               raise = 1;
25947                 }
25948         }
25949 +       if (raise)
25950 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
25953  #ifdef CONFIG_HIGH_RES_TIMERS
25954 @@ -1479,16 +1675,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
25955  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
25957         sl->timer.function = hrtimer_wakeup;
25958 +       sl->timer.irqsafe = 1;
25959         sl->task = task;
25961  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
25963 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
25964 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
25965 +                               unsigned long state)
25967         hrtimer_init_sleeper(t, current);
25969         do {
25970 -               set_current_state(TASK_INTERRUPTIBLE);
25971 +               set_current_state(state);
25972                 hrtimer_start_expires(&t->timer, mode);
25974                 if (likely(t->task))
25975 @@ -1530,7 +1728,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
25976                                 HRTIMER_MODE_ABS);
25977         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
25979 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
25980 +       /* cpu_chill() does not care about restart state. */
25981 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
25982                 goto out;
25984         rmtp = restart->nanosleep.rmtp;
25985 @@ -1547,8 +1746,10 @@ out:
25986         return ret;
25989 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25990 -                      const enum hrtimer_mode mode, const clockid_t clockid)
25991 +static long
25992 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
25993 +                   const enum hrtimer_mode mode, const clockid_t clockid,
25994 +                   unsigned long state)
25996         struct restart_block *restart;
25997         struct hrtimer_sleeper t;
25998 @@ -1561,7 +1762,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26000         hrtimer_init_on_stack(&t.timer, clockid, mode);
26001         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
26002 -       if (do_nanosleep(&t, mode))
26003 +       if (do_nanosleep(&t, mode, state))
26004                 goto out;
26006         /* Absolute timers do not update the rmtp value and restart: */
26007 @@ -1588,6 +1789,12 @@ out:
26008         return ret;
26011 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
26012 +                      const enum hrtimer_mode mode, const clockid_t clockid)
26014 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
26017  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
26018                 struct timespec __user *, rmtp)
26020 @@ -1602,6 +1809,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
26021         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
26024 +#ifdef CONFIG_PREEMPT_RT_FULL
26026 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
26027 + */
26028 +void cpu_chill(void)
26030 +       struct timespec tu = {
26031 +               .tv_nsec = NSEC_PER_MSEC,
26032 +       };
26033 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
26035 +       current->flags |= PF_NOFREEZE;
26036 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
26037 +                           TASK_UNINTERRUPTIBLE);
26038 +       if (!freeze_flag)
26039 +               current->flags &= ~PF_NOFREEZE;
26041 +EXPORT_SYMBOL(cpu_chill);
26042 +#endif
26044  /*
26045   * Functions related to boot-time initialization:
26046   */
26047 @@ -1613,10 +1840,14 @@ static void init_hrtimers_cpu(int cpu)
26048         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
26049                 cpu_base->clock_base[i].cpu_base = cpu_base;
26050                 timerqueue_init_head(&cpu_base->clock_base[i].active);
26051 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
26052         }
26054         cpu_base->cpu = cpu;
26055         hrtimer_init_hres(cpu_base);
26056 +#ifdef CONFIG_PREEMPT_RT_BASE
26057 +       init_waitqueue_head(&cpu_base->wait);
26058 +#endif
26061  #ifdef CONFIG_HOTPLUG_CPU
26062 @@ -1714,11 +1945,21 @@ static struct notifier_block hrtimers_nb = {
26063         .notifier_call = hrtimer_cpu_notify,
26064  };
26066 +#ifdef CONFIG_PREEMPT_RT_BASE
26067 +static void run_hrtimer_softirq(struct softirq_action *h)
26069 +       hrtimer_rt_run_pending();
26071 +#endif
26073  void __init hrtimers_init(void)
26075         hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
26076                           (void *)(long)smp_processor_id());
26077         register_cpu_notifier(&hrtimers_nb);
26078 +#ifdef CONFIG_PREEMPT_RT_BASE
26079 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
26080 +#endif
26083  /**
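
cpu_chill(), added above for PREEMPT_RT, replaces cpu_relax()-style spinning with a 1 ms TASK_UNINTERRUPTIBLE sleep while waiting for whoever holds the needed resource to release it. Below is a rough user-space sketch of the same retry-with-a-short-sleep idea; try_op() and the 1 ms figure are only stand-ins for whatever contended operation is being retried, not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static int attempts;

/* Placeholder for an operation that fails while someone else holds
 * the resource; here it simply succeeds on the third attempt. */
static bool try_op(void)
{
        return ++attempts >= 3;
}

/* Sleep roughly 1 ms instead of spinning, as cpu_chill() does on RT. */
static void chill(void)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000L };

        nanosleep(&ts, NULL);
}

int main(void)
{
        while (!try_op())
                chill();
        printf("succeeded after %d attempts\n", attempts);
        return 0;
}
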
26084 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
26085 index 1d5c7204ddc9..184de6751180 100644
26086 --- a/kernel/time/itimer.c
26087 +++ b/kernel/time/itimer.c
26088 @@ -213,6 +213,7 @@ again:
26089                 /* We are sharing ->siglock with it_real_fn() */
26090                 if (hrtimer_try_to_cancel(timer) < 0) {
26091                         spin_unlock_irq(&tsk->sighand->siglock);
26092 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
26093                         goto again;
26094                 }
26095                 expires = timeval_to_ktime(value->it_value);
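
The setitimer() hunk above handles a failed hrtimer_try_to_cancel() by dropping siglock and calling hrtimer_wait_for_timer(), which sleeps on the timer base's waitqueue until the running callback completes instead of spinning before retrying. A compact user-space analog of that sleep-until-the-callback-finished step follows, with a condition variable standing in for the waitqueue; all names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static bool callback_running = true;

static void *timer_callback(void *unused)
{
        usleep(10000);                          /* pretend to do callback work */
        pthread_mutex_lock(&lock);
        callback_running = false;               /* like waking base->wait */
        pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Cancel path: the callback is running, so sleep until it finishes
 * instead of spinning, then the caller can retry the cancel. */
static void wait_for_timer_analog(void)
{
        pthread_mutex_lock(&lock);
        while (callback_running)
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, timer_callback, NULL);
        wait_for_timer_analog();
        printf("callback finished, safe to re-arm the timer\n");
        pthread_join(tid, NULL);
        return 0;
}
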
26096 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
26097 index 347fecf86a3f..2ede47408a3e 100644
26098 --- a/kernel/time/jiffies.c
26099 +++ b/kernel/time/jiffies.c
26100 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
26101         .max_cycles     = 10,
26102  };
26104 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
26105 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
26106 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
26108  #if (BITS_PER_LONG < 64)
26109  u64 get_jiffies_64(void)
26110 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
26111         u64 ret;
26113         do {
26114 -               seq = read_seqbegin(&jiffies_lock);
26115 +               seq = read_seqcount_begin(&jiffies_seq);
26116                 ret = jiffies_64;
26117 -       } while (read_seqretry(&jiffies_lock, seq));
26118 +       } while (read_seqcount_retry(&jiffies_seq, seq));
26119         return ret;
26121  EXPORT_SYMBOL(get_jiffies_64);
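
The jiffies.c hunk above splits the old jiffies seqlock into a raw spinlock for the writer plus a separate seqcount, and get_jiffies_64() retries its lockless read whenever the sequence is odd or has changed. The sketch below shows that sequence-counter protocol in self-contained C11; it uses default (sequentially consistent) atomics for simplicity, whereas the kernel relies on lighter barriers, and the names are illustrative only.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Split of the old seqlock: a lock serializing writers plus a sequence
 * count for lockless readers; the count is odd while an update is in flight. */
static pthread_mutex_t jiffies_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic unsigned int jiffies_seq;
static _Atomic uint32_t jiffies_hi, jiffies_lo;

static void advance_jiffies64(uint64_t val)
{
        pthread_mutex_lock(&jiffies_lock);
        jiffies_seq++;                          /* now odd: update in progress */
        jiffies_lo = (uint32_t)val;
        jiffies_hi = (uint32_t)(val >> 32);
        jiffies_seq++;                          /* even again: update done */
        pthread_mutex_unlock(&jiffies_lock);
}

static uint64_t get_jiffies_64_analog(void)
{
        unsigned int seq;
        uint32_t hi, lo;

        do {
                seq = jiffies_seq;
                lo = jiffies_lo;
                hi = jiffies_hi;
        } while ((seq & 1) || jiffies_seq != seq);

        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        advance_jiffies64(0x100000002ULL);
        printf("jiffies64 = %llx\n",
               (unsigned long long)get_jiffies_64_analog());
        return 0;
}
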
26122 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
26123 index ab861771e37f..0f6868fd2de6 100644
26124 --- a/kernel/time/ntp.c
26125 +++ b/kernel/time/ntp.c
26126 @@ -10,6 +10,7 @@
26127  #include <linux/workqueue.h>
26128  #include <linux/hrtimer.h>
26129  #include <linux/jiffies.h>
26130 +#include <linux/kthread.h>
26131  #include <linux/math64.h>
26132  #include <linux/timex.h>
26133  #include <linux/time.h>
26134 @@ -562,10 +563,52 @@ static void sync_cmos_clock(struct work_struct *work)
26135                            &sync_cmos_work, timespec64_to_jiffies(&next));
26138 +#ifdef CONFIG_PREEMPT_RT_FULL
26140 + * RT can not call schedule_delayed_work from real interrupt context.
26141 + * Need to make a thread to do the real work.
26142 + */
26143 +static struct task_struct *cmos_delay_thread;
26144 +static bool do_cmos_delay;
26146 +static int run_cmos_delay(void *ignore)
26148 +       while (!kthread_should_stop()) {
26149 +               set_current_state(TASK_INTERRUPTIBLE);
26150 +               if (do_cmos_delay) {
26151 +                       do_cmos_delay = false;
26152 +                       queue_delayed_work(system_power_efficient_wq,
26153 +                                          &sync_cmos_work, 0);
26154 +               }
26155 +               schedule();
26156 +       }
26157 +       __set_current_state(TASK_RUNNING);
26158 +       return 0;
26161 +void ntp_notify_cmos_timer(void)
26163 +       do_cmos_delay = true;
26164 +       /* Make visible before waking up process */
26165 +       smp_wmb();
26166 +       wake_up_process(cmos_delay_thread);
26169 +static __init int create_cmos_delay_thread(void)
26171 +       cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
26172 +       BUG_ON(!cmos_delay_thread);
26173 +       return 0;
26175 +early_initcall(create_cmos_delay_thread);
26177 +#else
26179  void ntp_notify_cmos_timer(void)
26181         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
26183 +#endif /* CONFIG_PREEMPT_RT_FULL */
26185  #else
26186  void ntp_notify_cmos_timer(void) { }
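
Both the clock_was_set_delayed() change in hrtimer.c and the ntp_notify_cmos_timer() change above use the same pattern: since PREEMPT_RT cannot call schedule_work()/schedule_delayed_work() from hard interrupt context, the interrupt path only sets a flag and wakes a dedicated thread, and that thread performs the deferred call. Below is a user-space sketch of that set-flag / wake-helper pattern, with a pthread condition variable playing the role of the wakeup; everything named here is invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool do_deferred_work;
static bool stop;

/* The call the "interrupt" context is not allowed to make directly. */
static void queue_the_real_work(void)
{
        printf("queueing the deferred work from the helper thread\n");
}

static void *helper_thread(void *unused)
{
        pthread_mutex_lock(&lock);
        for (;;) {
                while (!do_deferred_work && !stop)
                        pthread_cond_wait(&cond, &lock);
                if (do_deferred_work) {
                        do_deferred_work = false;
                        queue_the_real_work();
                }
                if (stop && !do_deferred_work)
                        break;
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* What the (simulated) interrupt path does instead of queueing directly. */
static void notify_from_irq_context(void)
{
        pthread_mutex_lock(&lock);
        do_deferred_work = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, helper_thread, NULL);
        notify_from_irq_context();

        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
}
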
26187 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
26188 index 80016b329d94..b7342b6e6a5a 100644
26189 --- a/kernel/time/posix-cpu-timers.c
26190 +++ b/kernel/time/posix-cpu-timers.c
26191 @@ -3,6 +3,7 @@
26192   */
26194  #include <linux/sched.h>
26195 +#include <linux/sched/rt.h>
26196  #include <linux/posix-timers.h>
26197  #include <linux/errno.h>
26198  #include <linux/math64.h>
26199 @@ -650,7 +651,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
26200         /*
26201          * Disarm any old timer after extracting its expiry time.
26202          */
26203 -       WARN_ON_ONCE(!irqs_disabled());
26204 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26206         ret = 0;
26207         old_incr = timer->it.cpu.incr;
26208 @@ -1092,7 +1093,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
26209         /*
26210          * Now re-arm for the new expiry time.
26211          */
26212 -       WARN_ON_ONCE(!irqs_disabled());
26213 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26214         arm_timer(timer);
26215         unlock_task_sighand(p, &flags);
26217 @@ -1183,13 +1184,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
26218   * already updated our counts.  We need to check if any timers fire now.
26219   * Interrupts are disabled.
26220   */
26221 -void run_posix_cpu_timers(struct task_struct *tsk)
26222 +static void __run_posix_cpu_timers(struct task_struct *tsk)
26224         LIST_HEAD(firing);
26225         struct k_itimer *timer, *next;
26226         unsigned long flags;
26228 -       WARN_ON_ONCE(!irqs_disabled());
26229 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
26231         /*
26232          * The fast path checks that there are no expired thread or thread
26233 @@ -1243,6 +1244,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
26234         }
26237 +#ifdef CONFIG_PREEMPT_RT_BASE
26238 +#include <linux/kthread.h>
26239 +#include <linux/cpu.h>
26240 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
26241 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
26243 +static int posix_cpu_timers_thread(void *data)
26245 +       int cpu = (long)data;
26247 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
26249 +       while (!kthread_should_stop()) {
26250 +               struct task_struct *tsk = NULL;
26251 +               struct task_struct *next = NULL;
26253 +               if (cpu_is_offline(cpu))
26254 +                       goto wait_to_die;
26256 +               /* grab task list */
26257 +               raw_local_irq_disable();
26258 +               tsk = per_cpu(posix_timer_tasklist, cpu);
26259 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
26260 +               raw_local_irq_enable();
26262 +               /* it's possible the list is empty, just return */
26263 +               if (!tsk) {
26264 +                       set_current_state(TASK_INTERRUPTIBLE);
26265 +                       schedule();
26266 +                       __set_current_state(TASK_RUNNING);
26267 +                       continue;
26268 +               }
26270 +               /* Process task list */
26271 +               while (1) {
26272 +                       /* save next */
26273 +                       next = tsk->posix_timer_list;
26275 +                       /* run the task timers, clear its ptr and
26276 +                        * unreference it
26277 +                        */
26278 +                       __run_posix_cpu_timers(tsk);
26279 +                       tsk->posix_timer_list = NULL;
26280 +                       put_task_struct(tsk);
26282 +                       /* check if this is the last on the list */
26283 +                       if (next == tsk)
26284 +                               break;
26285 +                       tsk = next;
26286 +               }
26287 +       }
26288 +       return 0;
26290 +wait_to_die:
26291 +       /* Wait for kthread_stop */
26292 +       set_current_state(TASK_INTERRUPTIBLE);
26293 +       while (!kthread_should_stop()) {
26294 +               schedule();
26295 +               set_current_state(TASK_INTERRUPTIBLE);
26296 +       }
26297 +       __set_current_state(TASK_RUNNING);
26298 +       return 0;
26301 +static inline int __fastpath_timer_check(struct task_struct *tsk)
26303 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
26304 +       if (unlikely(tsk->exit_state))
26305 +               return 0;
26307 +       if (!task_cputime_zero(&tsk->cputime_expires))
26308 +                       return 1;
26310 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
26311 +                       return 1;
26313 +       return 0;
26316 +void run_posix_cpu_timers(struct task_struct *tsk)
26318 +       unsigned long cpu = smp_processor_id();
26319 +       struct task_struct *tasklist;
26321 +       BUG_ON(!irqs_disabled());
26322 +       if(!per_cpu(posix_timer_task, cpu))
26323 +               return;
26324 +       /* get per-cpu references */
26325 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
26327 +       /* check to see if we're already queued */
26328 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
26329 +               get_task_struct(tsk);
26330 +               if (tasklist) {
26331 +                       tsk->posix_timer_list = tasklist;
26332 +               } else {
26333 +                       /*
26334 +                        * The list is terminated by a self-pointing
26335 +                        * task_struct
26336 +                        */
26337 +                       tsk->posix_timer_list = tsk;
26338 +               }
26339 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
26341 +               wake_up_process(per_cpu(posix_timer_task, cpu));
26342 +       }
26346 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
26347 + * Here we can start up the necessary posix timer thread for the new CPU.
26348 + */
26349 +static int posix_cpu_thread_call(struct notifier_block *nfb,
26350 +                                unsigned long action, void *hcpu)
26352 +       int cpu = (long)hcpu;
26353 +       struct task_struct *p;
26354 +       struct sched_param param;
26356 +       switch (action) {
26357 +       case CPU_UP_PREPARE:
26358 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
26359 +                                       "posixcputmr/%d",cpu);
26360 +               if (IS_ERR(p))
26361 +                       return NOTIFY_BAD;
26362 +               p->flags |= PF_NOFREEZE;
26363 +               kthread_bind(p, cpu);
26364 +               /* Must be high prio to avoid getting starved */
26365 +               param.sched_priority = MAX_RT_PRIO-1;
26366 +               sched_setscheduler(p, SCHED_FIFO, &param);
26367 +               per_cpu(posix_timer_task,cpu) = p;
26368 +               break;
26369 +       case CPU_ONLINE:
26370 +               /* Strictly unnecessary, as first user will wake it. */
26371 +               wake_up_process(per_cpu(posix_timer_task,cpu));
26372 +               break;
26373 +#ifdef CONFIG_HOTPLUG_CPU
26374 +       case CPU_UP_CANCELED:
26375 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
26376 +               kthread_bind(per_cpu(posix_timer_task, cpu),
26377 +                            cpumask_any(cpu_online_mask));
26378 +               kthread_stop(per_cpu(posix_timer_task,cpu));
26379 +               per_cpu(posix_timer_task,cpu) = NULL;
26380 +               break;
26381 +       case CPU_DEAD:
26382 +               kthread_stop(per_cpu(posix_timer_task,cpu));
26383 +               per_cpu(posix_timer_task,cpu) = NULL;
26384 +               break;
26385 +#endif
26386 +       }
26387 +       return NOTIFY_OK;
26390 +/* Register at highest priority so that task migration (migrate_all_tasks)
26391 + * happens before everything else.
26392 + */
26393 +static struct notifier_block posix_cpu_thread_notifier = {
26394 +       .notifier_call = posix_cpu_thread_call,
26395 +       .priority = 10
26398 +static int __init posix_cpu_thread_init(void)
26400 +       void *hcpu = (void *)(long)smp_processor_id();
26401 +       /* Start one for boot CPU. */
26402 +       unsigned long cpu;
26404 +       /* init the per-cpu posix_timer_tasklets */
26405 +       for_each_possible_cpu(cpu)
26406 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
26408 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
26409 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
26410 +       register_cpu_notifier(&posix_cpu_thread_notifier);
26411 +       return 0;
26413 +early_initcall(posix_cpu_thread_init);
26414 +#else /* CONFIG_PREEMPT_RT_BASE */
26415 +void run_posix_cpu_timers(struct task_struct *tsk)
26417 +       __run_posix_cpu_timers(tsk);
26419 +#endif /* CONFIG_PREEMPT_RT_BASE */
26421  /*
26422   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
26423   * The tsk->sighand->siglock must be held by the caller.
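In the CONFIG_PREEMPT_RT_BASE branch above, run_posix_cpu_timers() no longer runs expired timers in hard interrupt context; it only links the task into a per-CPU list terminated by a self-pointing task_struct and wakes the SCHED_FIFO posixcputmr/N kthread, which detaches the list and walks it. A tiny userspace sketch of just that self-terminated intrusive list (the struct and function names are invented, and printf() stands in for __run_posix_cpu_timers()):

/* selfterm_list.c — cc -std=c11 selfterm_list.c */
#include <stdio.h>

struct node {
        int id;
        struct node *next;      /* NULL: not queued; self: last entry; else: next entry */
};

static struct node *list_head;  /* stands in for the per-CPU posix_timer_tasklist */

/* queue a node unless it is already queued (non-NULL next means "on the list") */
static void push(struct node *n)
{
        if (n->next)
                return;
        n->next = list_head ? list_head : n;    /* empty list: point at itself */
        list_head = n;
}

/* detach the whole list and process it, as the posixcputmr thread does */
static void drain(void)
{
        struct node *n = list_head;

        list_head = NULL;
        while (n) {
                struct node *next = n->next;

                printf("processing node %d\n", n->id);
                n->next = NULL;                 /* mark "not queued" again */
                if (next == n)                  /* self-pointer: that was the last one */
                        break;
                n = next;
        }
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

        push(&a);
        push(&b);
        push(&c);
        drain();        /* prints 3, 2, 1: entries are pushed LIFO */
        return 0;
}

A NULL next pointer doubles as the "not queued" marker, which is why the last element has to point at itself rather than at NULL.
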
26424 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
26425 index f2826c35e918..464a98155a0e 100644
26426 --- a/kernel/time/posix-timers.c
26427 +++ b/kernel/time/posix-timers.c
26428 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
26429  static struct pid *good_sigevent(sigevent_t * event)
26431         struct task_struct *rtn = current->group_leader;
26432 +       int sig = event->sigev_signo;
26434         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
26435                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
26436 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
26437                 return NULL;
26439         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
26440 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
26441 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
26442 +            sig_kernel_coredump(sig)))
26443                 return NULL;
26445         return task_pid(rtn);
26446 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
26447         return overrun;
26451 + * Protected by RCU!
26452 + */
26453 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
26455 +#ifdef CONFIG_PREEMPT_RT_FULL
26456 +       if (kc->timer_set == common_timer_set)
26457 +               hrtimer_wait_for_timer(&timr->it.real.timer);
26458 +       else
26459 +               /* FIXME: Whacky hack for posix-cpu-timers */
26460 +               schedule_timeout(1);
26461 +#endif
26464  /* Set a POSIX.1b interval timer. */
26465  /* timr->it_lock is taken. */
26466  static int
26467 @@ -903,6 +919,7 @@ retry:
26468         if (!timr)
26469                 return -EINVAL;
26471 +       rcu_read_lock();
26472         kc = clockid_to_kclock(timr->it_clock);
26473         if (WARN_ON_ONCE(!kc || !kc->timer_set))
26474                 error = -EINVAL;
26475 @@ -911,9 +928,12 @@ retry:
26477         unlock_timer(timr, flag);
26478         if (error == TIMER_RETRY) {
26479 +               timer_wait_for_callback(kc, timr);
26480                 rtn = NULL;     // We already got the old time...
26481 +               rcu_read_unlock();
26482                 goto retry;
26483         }
26484 +       rcu_read_unlock();
26486         if (old_setting && !error &&
26487             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
26488 @@ -951,10 +971,15 @@ retry_delete:
26489         if (!timer)
26490                 return -EINVAL;
26492 +       rcu_read_lock();
26493         if (timer_delete_hook(timer) == TIMER_RETRY) {
26494                 unlock_timer(timer, flags);
26495 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26496 +                                       timer);
26497 +               rcu_read_unlock();
26498                 goto retry_delete;
26499         }
26500 +       rcu_read_unlock();
26502         spin_lock(&current->sighand->siglock);
26503         list_del(&timer->list);
26504 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
26505  retry_delete:
26506         spin_lock_irqsave(&timer->it_lock, flags);
26508 +       /* On RT we can race with a deletion */
26509 +       if (!timer->it_signal) {
26510 +               unlock_timer(timer, flags);
26511 +               return;
26512 +       }
26514         if (timer_delete_hook(timer) == TIMER_RETRY) {
26515 +               rcu_read_lock();
26516                 unlock_timer(timer, flags);
26517 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
26518 +                                       timer);
26519 +               rcu_read_unlock();
26520                 goto retry_delete;
26521         }
26522         list_del(&timer->list);
26523 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
26524 index 53d7184da0be..1b4ac3361c3f 100644
26525 --- a/kernel/time/tick-broadcast-hrtimer.c
26526 +++ b/kernel/time/tick-broadcast-hrtimer.c
26527 @@ -106,5 +106,6 @@ void tick_setup_hrtimer_broadcast(void)
26529         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26530         bctimer.function = bc_handler;
26531 +       bctimer.irqsafe = true;
26532         clockevents_register_device(&ce_broadcast_hrtimer);
26534 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
26535 index 4fcd99e12aa0..5a47f2e98faf 100644
26536 --- a/kernel/time/tick-common.c
26537 +++ b/kernel/time/tick-common.c
26538 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
26539  static void tick_periodic(int cpu)
26541         if (tick_do_timer_cpu == cpu) {
26542 -               write_seqlock(&jiffies_lock);
26543 +               raw_spin_lock(&jiffies_lock);
26544 +               write_seqcount_begin(&jiffies_seq);
26546                 /* Keep track of the next tick event */
26547                 tick_next_period = ktime_add(tick_next_period, tick_period);
26549                 do_timer(1);
26550 -               write_sequnlock(&jiffies_lock);
26551 +               write_seqcount_end(&jiffies_seq);
26552 +               raw_spin_unlock(&jiffies_lock);
26553                 update_wall_time();
26554         }
26556 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
26557                 ktime_t next;
26559                 do {
26560 -                       seq = read_seqbegin(&jiffies_lock);
26561 +                       seq = read_seqcount_begin(&jiffies_seq);
26562                         next = tick_next_period;
26563 -               } while (read_seqretry(&jiffies_lock, seq));
26564 +               } while (read_seqcount_retry(&jiffies_seq, seq));
26566                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
26568 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
26569 index 22c57e191a23..d536824cbd36 100644
26570 --- a/kernel/time/tick-sched.c
26571 +++ b/kernel/time/tick-sched.c
26572 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
26573                 return;
26575         /* Reevalute with jiffies_lock held */
26576 -       write_seqlock(&jiffies_lock);
26577 +       raw_spin_lock(&jiffies_lock);
26578 +       write_seqcount_begin(&jiffies_seq);
26580         delta = ktime_sub(now, last_jiffies_update);
26581         if (delta.tv64 >= tick_period.tv64) {
26582 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
26583                 /* Keep the tick_next_period variable up to date */
26584                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
26585         } else {
26586 -               write_sequnlock(&jiffies_lock);
26587 +               write_seqcount_end(&jiffies_seq);
26588 +               raw_spin_unlock(&jiffies_lock);
26589                 return;
26590         }
26591 -       write_sequnlock(&jiffies_lock);
26592 +       write_seqcount_end(&jiffies_seq);
26593 +       raw_spin_unlock(&jiffies_lock);
26594         update_wall_time();
26597 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
26599         ktime_t period;
26601 -       write_seqlock(&jiffies_lock);
26602 +       raw_spin_lock(&jiffies_lock);
26603 +       write_seqcount_begin(&jiffies_seq);
26604         /* Did we start the jiffies update yet ? */
26605         if (last_jiffies_update.tv64 == 0)
26606                 last_jiffies_update = tick_next_period;
26607         period = last_jiffies_update;
26608 -       write_sequnlock(&jiffies_lock);
26609 +       write_seqcount_end(&jiffies_seq);
26610 +       raw_spin_unlock(&jiffies_lock);
26611         return period;
26614 @@ -176,6 +181,11 @@ static bool can_stop_full_tick(void)
26615                 return false;
26616         }
26618 +       if (!arch_irq_work_has_interrupt()) {
26619 +               trace_tick_stop(0, "missing irq work interrupt\n");
26620 +               return false;
26621 +       }
26623         /* sched_clock_tick() needs us? */
26624  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
26625         /*
26626 @@ -204,6 +214,7 @@ static void nohz_full_kick_work_func(struct irq_work *work)
26628  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
26629         .func = nohz_full_kick_work_func,
26630 +       .flags = IRQ_WORK_HARD_IRQ,
26631  };
26633  /*
26634 @@ -578,10 +589,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
26636         /* Read jiffies and the time when jiffies were updated last */
26637         do {
26638 -               seq = read_seqbegin(&jiffies_lock);
26639 +               seq = read_seqcount_begin(&jiffies_seq);
26640                 basemono = last_jiffies_update.tv64;
26641                 basejiff = jiffies;
26642 -       } while (read_seqretry(&jiffies_lock, seq));
26643 +       } while (read_seqcount_retry(&jiffies_seq, seq));
26644         ts->last_jiffies = basejiff;
26646         if (rcu_needs_cpu(basemono, &next_rcu) ||
26647 @@ -753,14 +764,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
26648                 return false;
26650         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
26651 -               static int ratelimit;
26653 -               if (ratelimit < 10 &&
26654 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
26655 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
26656 -                               (unsigned int) local_softirq_pending());
26657 -                       ratelimit++;
26658 -               }
26659 +               softirq_check_pending_idle();
26660                 return false;
26661         }
26663 @@ -1100,6 +1104,7 @@ void tick_setup_sched_timer(void)
26664          * Emulate tick processing via per-CPU hrtimers:
26665          */
26666         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
26667 +       ts->sched_timer.irqsafe = 1;
26668         ts->sched_timer.function = tick_sched_timer;
26670         /* Get the next period (per cpu) */
26671 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
26672 index 738012d68117..e060b34d5603 100644
26673 --- a/kernel/time/timekeeping.c
26674 +++ b/kernel/time/timekeeping.c
26675 @@ -2070,8 +2070,10 @@ EXPORT_SYMBOL(hardpps);
26676   */
26677  void xtime_update(unsigned long ticks)
26679 -       write_seqlock(&jiffies_lock);
26680 +       raw_spin_lock(&jiffies_lock);
26681 +       write_seqcount_begin(&jiffies_seq);
26682         do_timer(ticks);
26683 -       write_sequnlock(&jiffies_lock);
26684 +       write_seqcount_end(&jiffies_seq);
26685 +       raw_spin_unlock(&jiffies_lock);
26686         update_wall_time();
26688 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
26689 index 704f595ce83f..763a3e5121ff 100644
26690 --- a/kernel/time/timekeeping.h
26691 +++ b/kernel/time/timekeeping.h
26692 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
26693  extern void do_timer(unsigned long ticks);
26694  extern void update_wall_time(void);
26696 -extern seqlock_t jiffies_lock;
26697 +extern raw_spinlock_t jiffies_lock;
26698 +extern seqcount_t jiffies_seq;
26700  #define CS_NAME_LEN    32
26702 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
26703 index bbc5d1114583..603699ff9411 100644
26704 --- a/kernel/time/timer.c
26705 +++ b/kernel/time/timer.c
26706 @@ -80,6 +80,9 @@ struct tvec_root {
26707  struct tvec_base {
26708         spinlock_t lock;
26709         struct timer_list *running_timer;
26710 +#ifdef CONFIG_PREEMPT_RT_FULL
26711 +       wait_queue_head_t wait_for_running_timer;
26712 +#endif
26713         unsigned long timer_jiffies;
26714         unsigned long next_timer;
26715         unsigned long active_timers;
26716 @@ -777,6 +780,39 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
26717                 cpu_relax();
26718         }
26720 +#ifdef CONFIG_PREEMPT_RT_FULL
26721 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26722 +                                                 struct tvec_base *old,
26723 +                                                 struct tvec_base *new)
26725 +       /*
26726 +        * We cannot do the below because we might be preempted and
26727 +        * then the preempter would see NULL and loop forever.
26728 +        */
26729 +       if (spin_trylock(&new->lock)) {
26730 +               WRITE_ONCE(timer->flags,
26731 +                          (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26732 +               spin_unlock(&old->lock);
26733 +               return new;
26734 +       }
26735 +       return old;
26738 +#else
26739 +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
26740 +                                                 struct tvec_base *old,
26741 +                                                 struct tvec_base *new)
26743 +       /* See the comment in lock_timer_base() */
26744 +       timer->flags |= TIMER_MIGRATING;
26746 +       spin_unlock(&old->lock);
26747 +       spin_lock(&new->lock);
26748 +       WRITE_ONCE(timer->flags,
26749 +                  (timer->flags & ~TIMER_BASEMASK) | new->cpu);
26750 +       return new;
26752 +#endif
26754  static inline int
26755  __mod_timer(struct timer_list *timer, unsigned long expires,
26756 @@ -807,16 +843,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
26757                  * handler yet has not finished. This also guarantees that
26758                  * the timer is serialized wrt itself.
26759                  */
26760 -               if (likely(base->running_timer != timer)) {
26761 -                       /* See the comment in lock_timer_base() */
26762 -                       timer->flags |= TIMER_MIGRATING;
26764 -                       spin_unlock(&base->lock);
26765 -                       base = new_base;
26766 -                       spin_lock(&base->lock);
26767 -                       WRITE_ONCE(timer->flags,
26768 -                                  (timer->flags & ~TIMER_BASEMASK) | base->cpu);
26769 -               }
26770 +               if (likely(base->running_timer != timer))
26771 +                       base = switch_timer_base(timer, base, new_base);
26772         }
26774         timer->expires = expires;
26775 @@ -1006,6 +1034,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
26777  EXPORT_SYMBOL_GPL(add_timer_on);
26779 +#ifdef CONFIG_PREEMPT_RT_FULL
26781 + * Wait for a running timer
26782 + */
26783 +static void wait_for_running_timer(struct timer_list *timer)
26785 +       struct tvec_base *base;
26786 +       u32 tf = timer->flags;
26788 +       if (tf & TIMER_MIGRATING)
26789 +               return;
26791 +       base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
26792 +       wait_event(base->wait_for_running_timer,
26793 +                  base->running_timer != timer);
26796 +# define wakeup_timer_waiters(b)       wake_up_all(&(b)->wait_for_running_timer)
26797 +#else
26798 +static inline void wait_for_running_timer(struct timer_list *timer)
26800 +       cpu_relax();
26803 +# define wakeup_timer_waiters(b)       do { } while (0)
26804 +#endif
26806  /**
26807   * del_timer - deactive a timer.
26808   * @timer: the timer to be deactivated
26809 @@ -1063,7 +1118,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
26811  EXPORT_SYMBOL(try_to_del_timer_sync);
26813 -#ifdef CONFIG_SMP
26814 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
26815  /**
26816   * del_timer_sync - deactivate a timer and wait for the handler to finish.
26817   * @timer: the timer to be deactivated
26818 @@ -1123,7 +1178,7 @@ int del_timer_sync(struct timer_list *timer)
26819                 int ret = try_to_del_timer_sync(timer);
26820                 if (ret >= 0)
26821                         return ret;
26822 -               cpu_relax();
26823 +               wait_for_running_timer(timer);
26824         }
26826  EXPORT_SYMBOL(del_timer_sync);
26827 @@ -1248,16 +1303,18 @@ static inline void __run_timers(struct tvec_base *base)
26828                         if (irqsafe) {
26829                                 spin_unlock(&base->lock);
26830                                 call_timer_fn(timer, fn, data);
26831 +                               base->running_timer = NULL;
26832                                 spin_lock(&base->lock);
26833                         } else {
26834                                 spin_unlock_irq(&base->lock);
26835                                 call_timer_fn(timer, fn, data);
26836 +                               base->running_timer = NULL;
26837                                 spin_lock_irq(&base->lock);
26838                         }
26839                 }
26840         }
26841 -       base->running_timer = NULL;
26842         spin_unlock_irq(&base->lock);
26843 +       wakeup_timer_waiters(base);
26846  #ifdef CONFIG_NO_HZ_COMMON
26847 @@ -1390,6 +1447,14 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
26848         if (cpu_is_offline(smp_processor_id()))
26849                 return expires;
26851 +#ifdef CONFIG_PREEMPT_RT_FULL
26852 +       /*
26853 +        * On PREEMPT_RT we cannot sleep here. As a result we can't take
26854 +        * the base lock to check when the next timer is pending and so
26855 +        * we assume the next jiffy.
26856 +        */
26857 +       return basem + TICK_NSEC;
26858 +#endif
26859         spin_lock(&base->lock);
26860         if (base->active_timers) {
26861                 if (time_before_eq(base->next_timer, base->timer_jiffies))
26862 @@ -1416,13 +1481,13 @@ void update_process_times(int user_tick)
26864         /* Note: this timer irq context must be accounted for as well. */
26865         account_process_tick(p, user_tick);
26866 +       scheduler_tick();
26867         run_local_timers();
26868         rcu_check_callbacks(user_tick);
26869 -#ifdef CONFIG_IRQ_WORK
26870 +#if defined(CONFIG_IRQ_WORK)
26871         if (in_irq())
26872                 irq_work_tick();
26873  #endif
26874 -       scheduler_tick();
26875         run_posix_cpu_timers(p);
26878 @@ -1433,6 +1498,8 @@ static void run_timer_softirq(struct softirq_action *h)
26880         struct tvec_base *base = this_cpu_ptr(&tvec_bases);
26882 +       irq_work_tick_soft();
26884         if (time_after_eq(jiffies, base->timer_jiffies))
26885                 __run_timers(base);
26887 @@ -1589,7 +1656,7 @@ static void migrate_timers(int cpu)
26889         BUG_ON(cpu_online(cpu));
26890         old_base = per_cpu_ptr(&tvec_bases, cpu);
26891 -       new_base = get_cpu_ptr(&tvec_bases);
26892 +       new_base = get_local_ptr(&tvec_bases);
26893         /*
26894          * The caller is globally serialized and nobody else
26895          * takes two locks at once, deadlock is not possible.
26896 @@ -1613,7 +1680,7 @@ static void migrate_timers(int cpu)
26898         spin_unlock(&old_base->lock);
26899         spin_unlock_irq(&new_base->lock);
26900 -       put_cpu_ptr(&tvec_bases);
26901 +       put_local_ptr(&tvec_bases);
26904  static int timer_cpu_notify(struct notifier_block *self,
26905 @@ -1645,6 +1712,9 @@ static void __init init_timer_cpu(int cpu)
26907         base->cpu = cpu;
26908         spin_lock_init(&base->lock);
26909 +#ifdef CONFIG_PREEMPT_RT_FULL
26910 +       init_waitqueue_head(&base->wait_for_running_timer);
26911 +#endif
26913         base->timer_jiffies = jiffies;
26914         base->next_timer = base->timer_jiffies;
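With the timer.c hunks above, each tvec_base gains a wait queue on PREEMPT_RT_FULL: __run_timers() clears base->running_timer right after the callback and wakes any waiters, and del_timer_sync() sleeps on that queue instead of spinning in cpu_relax(), which on RT could otherwise spin against a preempted timer softirq thread. A userspace analogue of that blocking wait, with a pthread condition variable standing in for the wait queue (the fake_timer type, the sleeps and the demo timing are invented; the surrounding try_to_del_timer_sync() retry loop is omitted):

/* waitq_sketch.c — cc -std=c11 -pthread waitq_sketch.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct fake_timer {
        int id;
        void (*fn)(struct fake_timer *);
};

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  callback_done = PTHREAD_COND_INITIALIZER; /* plays wait_for_running_timer */
static struct fake_timer *running_timer;                         /* plays base->running_timer */

/* what __run_timers() does for one expired timer */
static void run_one(struct fake_timer *t)
{
        pthread_mutex_lock(&base_lock);
        running_timer = t;
        pthread_mutex_unlock(&base_lock);

        t->fn(t);                               /* callback runs without the base lock */

        pthread_mutex_lock(&base_lock);
        running_timer = NULL;
        pthread_cond_broadcast(&callback_done); /* plays wakeup_timer_waiters() */
        pthread_mutex_unlock(&base_lock);
}

/* what del_timer_sync() does on RT: sleep instead of cpu_relax()-spinning */
static void cancel_sync(struct fake_timer *t)
{
        pthread_mutex_lock(&base_lock);
        while (running_timer == t)
                pthread_cond_wait(&callback_done, &base_lock);
        pthread_mutex_unlock(&base_lock);
}

static void slow_callback(struct fake_timer *t)
{
        printf("timer %d: callback running\n", t->id);
        sleep(1);
}

static void *runner(void *arg)
{
        run_one(arg);
        return NULL;
}

int main(void)
{
        struct fake_timer t = { .id = 1, .fn = slow_callback };
        pthread_t th;

        pthread_create(&th, NULL, runner, &t);
        usleep(100 * 1000);             /* let the callback start */
        cancel_sync(&t);                /* blocks until the callback has finished */
        printf("timer %d: safely cancelled\n", t.id);
        pthread_join(th, NULL);
        return 0;
}
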
26915 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
26916 index e45db6b0d878..364ccd0eb57b 100644
26917 --- a/kernel/trace/Kconfig
26918 +++ b/kernel/trace/Kconfig
26919 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
26920           enabled. This option and the preempt-off timing option can be
26921           used together or separately.)
26923 +config INTERRUPT_OFF_HIST
26924 +       bool "Interrupts-off Latency Histogram"
26925 +       depends on IRQSOFF_TRACER
26926 +       help
26927 +         This option generates continuously updated histograms (one per cpu)
26928 +         of the duration of time periods with interrupts disabled. The
26929 +         histograms are disabled by default. To enable them, write a non-zero
26930 +         number to
26932 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26934 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
26935 +         per cpu) are generated that accumulate the duration of time periods
26936 +         when both interrupts and preemption are disabled. The histogram data
26937 +         will be located in the debug file system at
26939 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
26941  config PREEMPT_TRACER
26942         bool "Preemption-off Latency Tracer"
26943         default n
26944 @@ -211,6 +229,24 @@ config PREEMPT_TRACER
26945           enabled. This option and the irqs-off timing option can be
26946           used together or separately.)
26948 +config PREEMPT_OFF_HIST
26949 +       bool "Preemption-off Latency Histogram"
26950 +       depends on PREEMPT_TRACER
26951 +       help
26952 +         This option generates continuously updated histograms (one per cpu)
26953 +         of the duration of time periods with preemption disabled. The
26954 +         histograms are disabled by default. To enable them, write a non-zero
26955 +         number to
26957 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
26959 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
26960 +         per cpu) are generated that accumulate the duration of time periods
26961 +         when both interrupts and preemption are disabled. The histogram data
26962 +         will be located in the debug file system at
26964 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
26966  config SCHED_TRACER
26967         bool "Scheduling Latency Tracer"
26968         select GENERIC_TRACER
26969 @@ -221,6 +257,74 @@ config SCHED_TRACER
26970           This tracer tracks the latency of the highest priority task
26971           to be scheduled in, starting from the point it has woken up.
26973 +config WAKEUP_LATENCY_HIST
26974 +       bool "Scheduling Latency Histogram"
26975 +       depends on SCHED_TRACER
26976 +       help
26977 +         This option generates continuously updated histograms (one per cpu)
26978 +         of the scheduling latency of the highest priority task.
26979 +         The histograms are disabled by default. To enable them, write a
26980 +         non-zero number to
26982 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
26984 +         Two different algorithms are used, one to determine the latency of
26985 +         processes that exclusively use the highest priority of the system and
26986 +         another one to determine the latency of processes that share the
26987 +         highest system priority with other processes. The former is used to
26988 +         improve hardware and system software, the latter to optimize the
26989 +         priority design of a given system. The histogram data will be
26990 +         located in the debug file system at
26992 +             /sys/kernel/debug/tracing/latency_hist/wakeup
26994 +         and
26996 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
26998 +         If both Scheduling Latency Histogram and Missed Timer Offsets
26999 +         Histogram are selected, additional histogram data will be collected
27000 +         that contains, in addition to the wakeup latency, the timer latency in
27001 +         cases where the wakeup was triggered by an expired timer. These histograms
27002 +         are available in the
27004 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
27006 +         directory. They reflect the apparent interrupt and scheduling latency
27007 +         and are best suited to determining the worst-case latency of a given
27008 +         system. To enable these histograms, write a non-zero number to
27010 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
27012 +config MISSED_TIMER_OFFSETS_HIST
27013 +       depends on HIGH_RES_TIMERS
27014 +       select GENERIC_TRACER
27015 +       bool "Missed Timer Offsets Histogram"
27016 +       help
27017 +         Generate a histogram of missed timer offsets in microseconds. The
27018 +         histograms are disabled by default. To enable them, write a non-zero
27019 +         number to
27021 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
27023 +         The histogram data will be located in the debug file system at
27025 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
27027 +         If both Scheduling Latency Histogram and Missed Timer Offsets
27028 +         Histogram are selected, additional histogram data will be collected
27029 +         that contains, in addition to the wakeup latency, the timer latency in
27030 +         cases where the wakeup was triggered by an expired timer. These histograms
27031 +         are available in the
27033 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
27035 +         directory. They reflect the apparent interrupt and scheduling latency
27036 +         and are best suited to determining the worst-case latency of a given
27037 +         system. To enable these histograms, write a non-zero number to
27039 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
27041  config ENABLE_DEFAULT_TRACERS
27042         bool "Trace process context switches and events"
27043         depends on !GENERIC_TRACER
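All four help texts above describe the same workflow: the histograms start out disabled, are switched on by writing a non-zero number to a file under /sys/kernel/debug/tracing/latency_hist/enable/, and are then read back from per-CPU files in the matching directory. A minimal userspace sketch of that round trip for the wakeup histogram; it assumes debugfs mounted at /sys/kernel/debug, a kernel built with CONFIG_WAKEUP_LATENCY_HIST, and per-CPU files named CPU0, CPU1, ... (the per-CPU file names are an assumption, since the debugfs setup code is not part of the latency_hist.c excerpt below):

/* hist_enable.c — cc -std=c11 hist_enable.c; run as root with debugfs mounted */
#include <stdio.h>
#include <stdlib.h>

/* paths taken from the Kconfig help text; the per-CPU file name is an assumption */
#define ENABLE_FILE "/sys/kernel/debug/tracing/latency_hist/enable/wakeup"
#define HIST_FILE   "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0"

int main(void)
{
        FILE *f;
        char line[256];

        f = fopen(ENABLE_FILE, "w");
        if (!f) {
                perror(ENABLE_FILE);    /* kernel built without CONFIG_WAKEUP_LATENCY_HIST? */
                return EXIT_FAILURE;
        }
        fputs("1\n", f);                /* any non-zero number switches the histogram on */
        fclose(f);

        f = fopen(HIST_FILE, "r");
        if (!f) {
                perror(HIST_FILE);
                return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* header plus one "usecs  samples" row per bucket */
        fclose(f);
        return 0;
}
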
27044 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
27045 index 05ea5167e6bb..bc08c67301ae 100644
27046 --- a/kernel/trace/Makefile
27047 +++ b/kernel/trace/Makefile
27048 @@ -40,6 +40,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
27049  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
27050  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
27051  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
27052 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
27053 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
27054 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
27055 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
27056  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
27057  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
27058  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
27059 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
27060 new file mode 100644
27061 index 000000000000..7f6ee70dea41
27062 --- /dev/null
27063 +++ b/kernel/trace/latency_hist.c
27064 @@ -0,0 +1,1178 @@
27066 + * kernel/trace/latency_hist.c
27067 + *
27068 + * Add support for histograms of preemption-off latency and
27069 + * interrupt-off latency and wakeup latency; it depends on
27070 + * Real-Time Preemption Support.
27071 + *
27072 + *  Copyright (C) 2005 MontaVista Software, Inc.
27073 + *  Yi Yang <yyang@ch.mvista.com>
27074 + *
27075 + *  Converted to work with the new latency tracer.
27076 + *  Copyright (C) 2008 Red Hat, Inc.
27077 + *    Steven Rostedt <srostedt@redhat.com>
27078 + *
27079 + */
27080 +#include <linux/module.h>
27081 +#include <linux/debugfs.h>
27082 +#include <linux/seq_file.h>
27083 +#include <linux/percpu.h>
27084 +#include <linux/kallsyms.h>
27085 +#include <linux/uaccess.h>
27086 +#include <linux/sched.h>
27087 +#include <linux/sched/rt.h>
27088 +#include <linux/slab.h>
27089 +#include <linux/atomic.h>
27090 +#include <asm/div64.h>
27092 +#include "trace.h"
27093 +#include <trace/events/sched.h>
27095 +#define NSECS_PER_USECS 1000L
27097 +#define CREATE_TRACE_POINTS
27098 +#include <trace/events/hist.h>
27100 +enum {
27101 +       IRQSOFF_LATENCY = 0,
27102 +       PREEMPTOFF_LATENCY,
27103 +       PREEMPTIRQSOFF_LATENCY,
27104 +       WAKEUP_LATENCY,
27105 +       WAKEUP_LATENCY_SHAREDPRIO,
27106 +       MISSED_TIMER_OFFSETS,
27107 +       TIMERANDWAKEUP_LATENCY,
27108 +       MAX_LATENCY_TYPE,
27111 +#define MAX_ENTRY_NUM 10240
27113 +struct hist_data {
27114 +       atomic_t hist_mode; /* 0 log, 1 don't log */
27115 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
27116 +       long min_lat;
27117 +       long max_lat;
27118 +       unsigned long long below_hist_bound_samples;
27119 +       unsigned long long above_hist_bound_samples;
27120 +       long long accumulate_lat;
27121 +       unsigned long long total_samples;
27122 +       unsigned long long hist_array[MAX_ENTRY_NUM];
27125 +struct enable_data {
27126 +       int latency_type;
27127 +       int enabled;
27130 +static char *latency_hist_dir_root = "latency_hist";
27132 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27133 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
27134 +static char *irqsoff_hist_dir = "irqsoff";
27135 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
27136 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
27137 +#endif
27139 +#ifdef CONFIG_PREEMPT_OFF_HIST
27140 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
27141 +static char *preemptoff_hist_dir = "preemptoff";
27142 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
27143 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
27144 +#endif
27146 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
27147 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
27148 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
27149 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
27150 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
27151 +#endif
27153 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
27154 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
27155 +static struct enable_data preemptirqsoff_enabled_data = {
27156 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
27157 +       .enabled = 0,
27159 +#endif
27161 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27162 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27163 +struct maxlatproc_data {
27164 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
27165 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
27166 +       int pid;
27167 +       int current_pid;
27168 +       int prio;
27169 +       int current_prio;
27170 +       long latency;
27171 +       long timeroffset;
27172 +       cycle_t timestamp;
27174 +#endif
27176 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27177 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
27178 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
27179 +static char *wakeup_latency_hist_dir = "wakeup";
27180 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
27181 +static notrace void probe_wakeup_latency_hist_start(void *v,
27182 +       struct task_struct *p);
27183 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27184 +       bool preempt, struct task_struct *prev, struct task_struct *next);
27185 +static notrace void probe_sched_migrate_task(void *,
27186 +       struct task_struct *task, int cpu);
27187 +static struct enable_data wakeup_latency_enabled_data = {
27188 +       .latency_type = WAKEUP_LATENCY,
27189 +       .enabled = 0,
27191 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
27192 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
27193 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
27194 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
27195 +static unsigned long wakeup_pid;
27196 +#endif
27198 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27199 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
27200 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
27201 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
27202 +       long long offset, struct task_struct *curr, struct task_struct *task);
27203 +static struct enable_data missed_timer_offsets_enabled_data = {
27204 +       .latency_type = MISSED_TIMER_OFFSETS,
27205 +       .enabled = 0,
27207 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
27208 +static unsigned long missed_timer_offsets_pid;
27209 +#endif
27211 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27212 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27213 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
27214 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
27215 +static struct enable_data timerandwakeup_enabled_data = {
27216 +       .latency_type = TIMERANDWAKEUP_LATENCY,
27217 +       .enabled = 0,
27219 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
27220 +#endif
27222 +void notrace latency_hist(int latency_type, int cpu, long latency,
27223 +                         long timeroffset, cycle_t stop,
27224 +                         struct task_struct *p)
27226 +       struct hist_data *my_hist;
27227 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27228 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27229 +       struct maxlatproc_data *mp = NULL;
27230 +#endif
27232 +       if (!cpu_possible(cpu) || latency_type < 0 ||
27233 +           latency_type >= MAX_LATENCY_TYPE)
27234 +               return;
27236 +       switch (latency_type) {
27237 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27238 +       case IRQSOFF_LATENCY:
27239 +               my_hist = &per_cpu(irqsoff_hist, cpu);
27240 +               break;
27241 +#endif
27242 +#ifdef CONFIG_PREEMPT_OFF_HIST
27243 +       case PREEMPTOFF_LATENCY:
27244 +               my_hist = &per_cpu(preemptoff_hist, cpu);
27245 +               break;
27246 +#endif
27247 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
27248 +       case PREEMPTIRQSOFF_LATENCY:
27249 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
27250 +               break;
27251 +#endif
27252 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27253 +       case WAKEUP_LATENCY:
27254 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
27255 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
27256 +               break;
27257 +       case WAKEUP_LATENCY_SHAREDPRIO:
27258 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
27259 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
27260 +               break;
27261 +#endif
27262 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27263 +       case MISSED_TIMER_OFFSETS:
27264 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
27265 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
27266 +               break;
27267 +#endif
27268 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27269 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27270 +       case TIMERANDWAKEUP_LATENCY:
27271 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
27272 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
27273 +               break;
27274 +#endif
27276 +       default:
27277 +               return;
27278 +       }
27280 +       latency += my_hist->offset;
27282 +       if (atomic_read(&my_hist->hist_mode) == 0)
27283 +               return;
27285 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
27286 +               if (latency < 0)
27287 +                       my_hist->below_hist_bound_samples++;
27288 +               else
27289 +                       my_hist->above_hist_bound_samples++;
27290 +       } else
27291 +               my_hist->hist_array[latency]++;
27293 +       if (unlikely(latency > my_hist->max_lat ||
27294 +           my_hist->min_lat == LONG_MAX)) {
27295 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27296 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27297 +               if (latency_type == WAKEUP_LATENCY ||
27298 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
27299 +                   latency_type == MISSED_TIMER_OFFSETS ||
27300 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
27301 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
27302 +                       strncpy(mp->current_comm, current->comm,
27303 +                           sizeof(mp->current_comm));
27304 +                       mp->pid = task_pid_nr(p);
27305 +                       mp->current_pid = task_pid_nr(current);
27306 +                       mp->prio = p->prio;
27307 +                       mp->current_prio = current->prio;
27308 +                       mp->latency = latency;
27309 +                       mp->timeroffset = timeroffset;
27310 +                       mp->timestamp = stop;
27311 +               }
27312 +#endif
27313 +               my_hist->max_lat = latency;
27314 +       }
27315 +       if (unlikely(latency < my_hist->min_lat))
27316 +               my_hist->min_lat = latency;
27317 +       my_hist->total_samples++;
27318 +       my_hist->accumulate_lat += latency;
27321 +static void *l_start(struct seq_file *m, loff_t *pos)
27323 +       loff_t *index_ptr = NULL;
27324 +       loff_t index = *pos;
27325 +       struct hist_data *my_hist = m->private;
27327 +       if (index == 0) {
27328 +               char minstr[32], avgstr[32], maxstr[32];
27330 +               atomic_dec(&my_hist->hist_mode);
27332 +               if (likely(my_hist->total_samples)) {
27333 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
27334 +                           my_hist->total_samples);
27335 +                       snprintf(minstr, sizeof(minstr), "%ld",
27336 +                           my_hist->min_lat - my_hist->offset);
27337 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
27338 +                           avg - my_hist->offset);
27339 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
27340 +                           my_hist->max_lat - my_hist->offset);
27341 +               } else {
27342 +                       strcpy(minstr, "<undef>");
27343 +                       strcpy(avgstr, minstr);
27344 +                       strcpy(maxstr, minstr);
27345 +               }
27347 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
27348 +                          "#Average latency: %s microseconds\n"
27349 +                          "#Maximum latency: %s microseconds\n"
27350 +                          "#Total samples: %llu\n"
27351 +                          "#There are %llu samples lower than %ld"
27352 +                          " microseconds.\n"
27353 +                          "#There are %llu samples greater or equal"
27354 +                          " than %ld microseconds.\n"
27355 +                          "#usecs\t%16s\n",
27356 +                          minstr, avgstr, maxstr,
27357 +                          my_hist->total_samples,
27358 +                          my_hist->below_hist_bound_samples,
27359 +                          -my_hist->offset,
27360 +                          my_hist->above_hist_bound_samples,
27361 +                          MAX_ENTRY_NUM - my_hist->offset,
27362 +                          "samples");
27363 +       }
27364 +       if (index < MAX_ENTRY_NUM) {
27365 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
27366 +               if (index_ptr)
27367 +                       *index_ptr = index;
27368 +       }
27370 +       return index_ptr;
27373 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
27375 +       loff_t *index_ptr = p;
27376 +       struct hist_data *my_hist = m->private;
27378 +       if (++*pos >= MAX_ENTRY_NUM) {
27379 +               atomic_inc(&my_hist->hist_mode);
27380 +               return NULL;
27381 +       }
27382 +       *index_ptr = *pos;
27383 +       return index_ptr;
27386 +static void l_stop(struct seq_file *m, void *p)
27388 +       kfree(p);
27391 +static int l_show(struct seq_file *m, void *p)
27393 +       int index = *(loff_t *) p;
27394 +       struct hist_data *my_hist = m->private;
27396 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
27397 +           my_hist->hist_array[index]);
27398 +       return 0;
27401 +static const struct seq_operations latency_hist_seq_op = {
27402 +       .start = l_start,
27403 +       .next  = l_next,
27404 +       .stop  = l_stop,
27405 +       .show  = l_show
27408 +static int latency_hist_open(struct inode *inode, struct file *file)
27410 +       int ret;
27412 +       ret = seq_open(file, &latency_hist_seq_op);
27413 +       if (!ret) {
27414 +               struct seq_file *seq = file->private_data;
27415 +               seq->private = inode->i_private;
27416 +       }
27417 +       return ret;
27420 +static const struct file_operations latency_hist_fops = {
27421 +       .open = latency_hist_open,
27422 +       .read = seq_read,
27423 +       .llseek = seq_lseek,
27424 +       .release = seq_release,
27427 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27428 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27429 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
27431 +       mp->comm[0] = mp->current_comm[0] = '\0';
27432 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
27433 +           mp->latency = mp->timeroffset = -1;
27434 +       mp->timestamp = 0;
27436 +#endif
27438 +static void hist_reset(struct hist_data *hist)
27440 +       atomic_dec(&hist->hist_mode);
27442 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
27443 +       hist->below_hist_bound_samples = 0ULL;
27444 +       hist->above_hist_bound_samples = 0ULL;
27445 +       hist->min_lat = LONG_MAX;
27446 +       hist->max_lat = LONG_MIN;
27447 +       hist->total_samples = 0ULL;
27448 +       hist->accumulate_lat = 0LL;
27450 +       atomic_inc(&hist->hist_mode);
27453 +static ssize_t
27454 +latency_hist_reset(struct file *file, const char __user *a,
27455 +                  size_t size, loff_t *off)
27457 +       int cpu;
27458 +       struct hist_data *hist = NULL;
27459 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27460 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27461 +       struct maxlatproc_data *mp = NULL;
27462 +#endif
27463 +       off_t latency_type = (off_t) file->private_data;
27465 +       for_each_online_cpu(cpu) {
27467 +               switch (latency_type) {
27468 +#ifdef CONFIG_PREEMPT_OFF_HIST
27469 +               case PREEMPTOFF_LATENCY:
27470 +                       hist = &per_cpu(preemptoff_hist, cpu);
27471 +                       break;
27472 +#endif
27473 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27474 +               case IRQSOFF_LATENCY:
27475 +                       hist = &per_cpu(irqsoff_hist, cpu);
27476 +                       break;
27477 +#endif
27478 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27479 +               case PREEMPTIRQSOFF_LATENCY:
27480 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
27481 +                       break;
27482 +#endif
27483 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27484 +               case WAKEUP_LATENCY:
27485 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
27486 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
27487 +                       break;
27488 +               case WAKEUP_LATENCY_SHAREDPRIO:
27489 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
27490 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
27491 +                       break;
27492 +#endif
27493 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27494 +               case MISSED_TIMER_OFFSETS:
27495 +                       hist = &per_cpu(missed_timer_offsets, cpu);
27496 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
27497 +                       break;
27498 +#endif
27499 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27500 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27501 +               case TIMERANDWAKEUP_LATENCY:
27502 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
27503 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
27504 +                       break;
27505 +#endif
27506 +               }
27508 +               hist_reset(hist);
27509 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27510 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27511 +               if (latency_type == WAKEUP_LATENCY ||
27512 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
27513 +                   latency_type == MISSED_TIMER_OFFSETS ||
27514 +                   latency_type == TIMERANDWAKEUP_LATENCY)
27515 +                       clear_maxlatprocdata(mp);
27516 +#endif
27517 +       }
27519 +       return size;
27522 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27523 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27524 +static ssize_t
27525 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
27527 +       char buf[64];
27528 +       int r;
27529 +       unsigned long *this_pid = file->private_data;
27531 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
27532 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
27535 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
27536 +                     size_t cnt, loff_t *ppos)
27538 +       char buf[64];
27539 +       unsigned long pid;
27540 +       unsigned long *this_pid = file->private_data;
27542 +       if (cnt >= sizeof(buf))
27543 +               return -EINVAL;
27545 +       if (copy_from_user(&buf, ubuf, cnt))
27546 +               return -EFAULT;
27548 +       buf[cnt] = '\0';
27550 +       if (kstrtoul(buf, 10, &pid))
27551 +               return -EINVAL;
27553 +       *this_pid = pid;
27555 +       return cnt;
27557 +#endif
27559 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27560 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27561 +static ssize_t
27562 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
27564 +       int r;
27565 +       struct maxlatproc_data *mp = file->private_data;
27566 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
27567 +       unsigned long long t;
27568 +       unsigned long usecs, secs;
27569 +       char *buf;
27571 +       if (mp->pid == -1 || mp->current_pid == -1) {
27572 +               buf = "(none)\n";
27573 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
27574 +                   strlen(buf));
27575 +       }
27577 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
27578 +       if (buf == NULL)
27579 +               return -ENOMEM;
27581 +       t = ns2usecs(mp->timestamp);
27582 +       usecs = do_div(t, USEC_PER_SEC);
27583 +       secs = (unsigned long) t;
27584 +       r = snprintf(buf, strmaxlen,
27585 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
27586 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
27587 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
27588 +           secs, usecs);
27589 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
27590 +       kfree(buf);
27591 +       return r;
27593 +#endif
27595 +static ssize_t
27596 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
27598 +       char buf[64];
27599 +       struct enable_data *ed = file->private_data;
27600 +       int r;
27602 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
27603 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
27606 +static ssize_t
27607 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
27609 +       char buf[64];
27610 +       long enable;
27611 +       struct enable_data *ed = file->private_data;
27613 +       if (cnt >= sizeof(buf))
27614 +               return -EINVAL;
27616 +       if (copy_from_user(&buf, ubuf, cnt))
27617 +               return -EFAULT;
27619 +       buf[cnt] = 0;
27621 +       if (kstrtoul(buf, 10, &enable))
27622 +               return -EINVAL;
27624 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
27625 +               return cnt;
27627 +       if (enable) {
27628 +               int ret;
27630 +               switch (ed->latency_type) {
27631 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27632 +               case PREEMPTIRQSOFF_LATENCY:
27633 +                       ret = register_trace_preemptirqsoff_hist(
27634 +                           probe_preemptirqsoff_hist, NULL);
27635 +                       if (ret) {
27636 +                               pr_info("wakeup trace: Couldn't assign "
27637 +                                   "probe_preemptirqsoff_hist "
27638 +                                   "to trace_preemptirqsoff_hist\n");
27639 +                               return ret;
27640 +                       }
27641 +                       break;
27642 +#endif
27643 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27644 +               case WAKEUP_LATENCY:
27645 +                       ret = register_trace_sched_wakeup(
27646 +                           probe_wakeup_latency_hist_start, NULL);
27647 +                       if (ret) {
27648 +                               pr_info("wakeup trace: Couldn't assign "
27649 +                                   "probe_wakeup_latency_hist_start "
27650 +                                   "to trace_sched_wakeup\n");
27651 +                               return ret;
27652 +                       }
27653 +                       ret = register_trace_sched_wakeup_new(
27654 +                           probe_wakeup_latency_hist_start, NULL);
27655 +                       if (ret) {
27656 +                               pr_info("wakeup trace: Couldn't assign "
27657 +                                   "probe_wakeup_latency_hist_start "
27658 +                                   "to trace_sched_wakeup_new\n");
27659 +                               unregister_trace_sched_wakeup(
27660 +                                   probe_wakeup_latency_hist_start, NULL);
27661 +                               return ret;
27662 +                       }
27663 +                       ret = register_trace_sched_switch(
27664 +                           probe_wakeup_latency_hist_stop, NULL);
27665 +                       if (ret) {
27666 +                               pr_info("wakeup trace: Couldn't assign "
27667 +                                   "probe_wakeup_latency_hist_stop "
27668 +                                   "to trace_sched_switch\n");
27669 +                               unregister_trace_sched_wakeup(
27670 +                                   probe_wakeup_latency_hist_start, NULL);
27671 +                               unregister_trace_sched_wakeup_new(
27672 +                                   probe_wakeup_latency_hist_start, NULL);
27673 +                               return ret;
27674 +                       }
27675 +                       ret = register_trace_sched_migrate_task(
27676 +                           probe_sched_migrate_task, NULL);
27677 +                       if (ret) {
27678 +                               pr_info("wakeup trace: Couldn't assign "
27679 +                                   "probe_sched_migrate_task "
27680 +                                   "to trace_sched_migrate_task\n");
27681 +                               unregister_trace_sched_wakeup(
27682 +                                   probe_wakeup_latency_hist_start, NULL);
27683 +                               unregister_trace_sched_wakeup_new(
27684 +                                   probe_wakeup_latency_hist_start, NULL);
27685 +                               unregister_trace_sched_switch(
27686 +                                   probe_wakeup_latency_hist_stop, NULL);
27687 +                               return ret;
27688 +                       }
27689 +                       break;
27690 +#endif
27691 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27692 +               case MISSED_TIMER_OFFSETS:
27693 +                       ret = register_trace_hrtimer_interrupt(
27694 +                           probe_hrtimer_interrupt, NULL);
27695 +                       if (ret) {
27696 +                               pr_info("wakeup trace: Couldn't assign "
27697 +                                   "probe_hrtimer_interrupt "
27698 +                                   "to trace_hrtimer_interrupt\n");
27699 +                               return ret;
27700 +                       }
27701 +                       break;
27702 +#endif
27703 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
27704 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27705 +               case TIMERANDWAKEUP_LATENCY:
27706 +                       if (!wakeup_latency_enabled_data.enabled ||
27707 +                           !missed_timer_offsets_enabled_data.enabled)
27708 +                               return -EINVAL;
27709 +                       break;
27710 +#endif
27711 +               default:
27712 +                       break;
27713 +               }
27714 +       } else {
27715 +               switch (ed->latency_type) {
27716 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27717 +               case PREEMPTIRQSOFF_LATENCY:
27718 +                       {
27719 +                               int cpu;
27721 +                               unregister_trace_preemptirqsoff_hist(
27722 +                                   probe_preemptirqsoff_hist, NULL);
27723 +                               for_each_online_cpu(cpu) {
27724 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27725 +                                       per_cpu(hist_irqsoff_counting,
27726 +                                           cpu) = 0;
27727 +#endif
27728 +#ifdef CONFIG_PREEMPT_OFF_HIST
27729 +                                       per_cpu(hist_preemptoff_counting,
27730 +                                           cpu) = 0;
27731 +#endif
27732 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27733 +                                       per_cpu(hist_preemptirqsoff_counting,
27734 +                                           cpu) = 0;
27735 +#endif
27736 +                               }
27737 +                       }
27738 +                       break;
27739 +#endif
27740 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27741 +               case WAKEUP_LATENCY:
27742 +                       {
27743 +                               int cpu;
27745 +                               unregister_trace_sched_wakeup(
27746 +                                   probe_wakeup_latency_hist_start, NULL);
27747 +                               unregister_trace_sched_wakeup_new(
27748 +                                   probe_wakeup_latency_hist_start, NULL);
27749 +                               unregister_trace_sched_switch(
27750 +                                   probe_wakeup_latency_hist_stop, NULL);
27751 +                               unregister_trace_sched_migrate_task(
27752 +                                   probe_sched_migrate_task, NULL);
27754 +                               for_each_online_cpu(cpu) {
27755 +                                       per_cpu(wakeup_task, cpu) = NULL;
27756 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
27757 +                               }
27758 +                       }
27759 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27760 +                       timerandwakeup_enabled_data.enabled = 0;
27761 +#endif
27762 +                       break;
27763 +#endif
27764 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
27765 +               case MISSED_TIMER_OFFSETS:
27766 +                       unregister_trace_hrtimer_interrupt(
27767 +                           probe_hrtimer_interrupt, NULL);
27768 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27769 +                       timerandwakeup_enabled_data.enabled = 0;
27770 +#endif
27771 +                       break;
27772 +#endif
27773 +               default:
27774 +                       break;
27775 +               }
27776 +       }
27777 +       ed->enabled = enable;
27778 +       return cnt;
27781 +static const struct file_operations latency_hist_reset_fops = {
27782 +       .open = tracing_open_generic,
27783 +       .write = latency_hist_reset,
27786 +static const struct file_operations enable_fops = {
27787 +       .open = tracing_open_generic,
27788 +       .read = show_enable,
27789 +       .write = do_enable,
27792 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
27793 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
27794 +static const struct file_operations pid_fops = {
27795 +       .open = tracing_open_generic,
27796 +       .read = show_pid,
27797 +       .write = do_pid,
27800 +static const struct file_operations maxlatproc_fops = {
27801 +       .open = tracing_open_generic,
27802 +       .read = show_maxlatproc,
27804 +#endif
27806 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
27807 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
27808 +       int starthist)
27810 +       int cpu = raw_smp_processor_id();
27811 +       int time_set = 0;
27813 +       if (starthist) {
27814 +               cycle_t uninitialized_var(start);
27816 +               if (!preempt_count() && !irqs_disabled())
27817 +                       return;
27819 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27820 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
27821 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
27822 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
27823 +                       start = ftrace_now(cpu);
27824 +                       time_set++;
27825 +                       per_cpu(hist_irqsoff_start, cpu) = start;
27826 +               }
27827 +#endif
27829 +#ifdef CONFIG_PREEMPT_OFF_HIST
27830 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
27831 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
27832 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
27833 +                       if (!(time_set++))
27834 +                               start = ftrace_now(cpu);
27835 +                       per_cpu(hist_preemptoff_start, cpu) = start;
27836 +               }
27837 +#endif
27839 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27840 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
27841 +                   per_cpu(hist_preemptoff_counting, cpu) &&
27842 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
27843 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
27844 +                       if (!time_set)
27845 +                               start = ftrace_now(cpu);
27846 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
27847 +               }
27848 +#endif
27849 +       } else {
27850 +               cycle_t uninitialized_var(stop);
27852 +#ifdef CONFIG_INTERRUPT_OFF_HIST
27853 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
27854 +                   per_cpu(hist_irqsoff_counting, cpu)) {
27855 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
27857 +                       stop = ftrace_now(cpu);
27858 +                       time_set++;
27859 +                       if (start) {
27860 +                               long latency = ((long) (stop - start)) /
27861 +                                   NSECS_PER_USECS;
27863 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
27864 +                                   stop, NULL);
27865 +                       }
27866 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
27867 +               }
27868 +#endif
27870 +#ifdef CONFIG_PREEMPT_OFF_HIST
27871 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
27872 +                   per_cpu(hist_preemptoff_counting, cpu)) {
27873 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
27875 +                       if (!(time_set++))
27876 +                               stop = ftrace_now(cpu);
27877 +                       if (start) {
27878 +                               long latency = ((long) (stop - start)) /
27879 +                                   NSECS_PER_USECS;
27881 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
27882 +                                   0, stop, NULL);
27883 +                       }
27884 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
27885 +               }
27886 +#endif
27888 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
27889 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
27890 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
27891 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
27892 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
27894 +                       if (!time_set)
27895 +                               stop = ftrace_now(cpu);
27896 +                       if (start) {
27897 +                               long latency = ((long) (stop - start)) /
27898 +                                   NSECS_PER_USECS;
27900 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
27901 +                                   latency, 0, stop, NULL);
27902 +                       }
27903 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
27904 +               }
27905 +#endif
27906 +       }
27908 +#endif
27910 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
27911 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
27912 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
27913 +       int cpu)
27915 +       int old_cpu = task_cpu(task);
27917 +       if (cpu != old_cpu) {
27918 +               unsigned long flags;
27919 +               struct task_struct *cpu_wakeup_task;
27921 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
27923 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
27924 +               if (task == cpu_wakeup_task) {
27925 +                       put_task_struct(cpu_wakeup_task);
27926 +                       per_cpu(wakeup_task, old_cpu) = NULL;
27927 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
27928 +                       get_task_struct(cpu_wakeup_task);
27929 +               }
27931 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27932 +       }
27935 +static notrace void probe_wakeup_latency_hist_start(void *v,
27936 +       struct task_struct *p)
27938 +       unsigned long flags;
27939 +       struct task_struct *curr = current;
27940 +       int cpu = task_cpu(p);
27941 +       struct task_struct *cpu_wakeup_task;
27943 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27945 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27947 +       if (wakeup_pid) {
27948 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27949 +                   p->prio == curr->prio)
27950 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27951 +               if (likely(wakeup_pid != task_pid_nr(p)))
27952 +                       goto out;
27953 +       } else {
27954 +               if (likely(!rt_task(p)) ||
27955 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
27956 +                   p->prio > curr->prio)
27957 +                       goto out;
27958 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
27959 +                   p->prio == curr->prio)
27960 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
27961 +       }
27963 +       if (cpu_wakeup_task)
27964 +               put_task_struct(cpu_wakeup_task);
27965 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
27966 +       get_task_struct(cpu_wakeup_task);
27967 +       cpu_wakeup_task->preempt_timestamp_hist =
27968 +               ftrace_now(raw_smp_processor_id());
27969 +out:
27970 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
27973 +static notrace void probe_wakeup_latency_hist_stop(void *v,
27974 +       bool preempt, struct task_struct *prev, struct task_struct *next)
27976 +       unsigned long flags;
27977 +       int cpu = task_cpu(next);
27978 +       long latency;
27979 +       cycle_t stop;
27980 +       struct task_struct *cpu_wakeup_task;
27982 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
27984 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
27986 +       if (cpu_wakeup_task == NULL)
27987 +               goto out;
27989 +       /* Already running? */
27990 +       if (unlikely(current == cpu_wakeup_task))
27991 +               goto out_reset;
27993 +       if (next != cpu_wakeup_task) {
27994 +               if (next->prio < cpu_wakeup_task->prio)
27995 +                       goto out_reset;
27997 +               if (next->prio == cpu_wakeup_task->prio)
27998 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
28000 +               goto out;
28001 +       }
28003 +       if (current->prio == cpu_wakeup_task->prio)
28004 +               per_cpu(wakeup_sharedprio, cpu) = 1;
28006 +       /*
28007 +        * The task we are waiting for is about to be switched to.
28008 +        * Calculate latency and store it in histogram.
28009 +        */
28010 +       stop = ftrace_now(raw_smp_processor_id());
28012 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
28013 +           NSECS_PER_USECS;
28015 +       if (per_cpu(wakeup_sharedprio, cpu)) {
28016 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
28017 +                   next);
28018 +               per_cpu(wakeup_sharedprio, cpu) = 0;
28019 +       } else {
28020 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
28021 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28022 +               if (timerandwakeup_enabled_data.enabled) {
28023 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
28024 +                           next->timer_offset + latency, next->timer_offset,
28025 +                           stop, next);
28026 +               }
28027 +#endif
28028 +       }
28030 +out_reset:
28031 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28032 +       next->timer_offset = 0;
28033 +#endif
28034 +       put_task_struct(cpu_wakeup_task);
28035 +       per_cpu(wakeup_task, cpu) = NULL;
28036 +out:
28037 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
28039 +#endif
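The two probes above form a start/stop pair: probe_wakeup_latency_hist_start()
stamps preempt_timestamp_hist when a task is woken, and
probe_wakeup_latency_hist_stop() converts the delta to microseconds once that
task is actually switched in. A worked sketch of the arithmetic, with purely
illustrative timestamps (NSECS_PER_USECS is defined earlier in this file):

        /* illustrative values: woken at 5,000,000 ns, switched in at 5,120,000 ns */
        cycle_t start = 5000000, stop = 5120000;
        long latency = ((long)(stop - start)) / NSECS_PER_USECS;   /* == 120 us */

The result is then handed to latency_hist(WAKEUP_LATENCY, ...) for binning,
exactly as in the stop probe above.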
28041 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28042 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
28043 +       long long latency_ns, struct task_struct *curr,
28044 +       struct task_struct *task)
28046 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
28047 +           (task->prio < curr->prio ||
28048 +           (task->prio == curr->prio &&
28049 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
28050 +               long latency;
28051 +               cycle_t now;
28053 +               if (missed_timer_offsets_pid) {
28054 +                       if (likely(missed_timer_offsets_pid !=
28055 +                           task_pid_nr(task)))
28056 +                               return;
28057 +               }
28059 +               now = ftrace_now(cpu);
28060 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
28061 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
28062 +                   task);
28063 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28064 +               task->timer_offset = latency;
28065 +#endif
28066 +       }
28068 +#endif
28070 +static __init int latency_hist_init(void)
28072 +       struct dentry *latency_hist_root = NULL;
28073 +       struct dentry *dentry;
28074 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28075 +       struct dentry *dentry_sharedprio;
28076 +#endif
28077 +       struct dentry *entry;
28078 +       struct dentry *enable_root;
28079 +       int i = 0;
28080 +       struct hist_data *my_hist;
28081 +       char name[64];
28082 +       char *cpufmt = "CPU%d";
28083 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
28084 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28085 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
28086 +       struct maxlatproc_data *mp = NULL;
28087 +#endif
28089 +       dentry = tracing_init_dentry();
28090 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
28091 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
28093 +#ifdef CONFIG_INTERRUPT_OFF_HIST
28094 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
28095 +       for_each_possible_cpu(i) {
28096 +               sprintf(name, cpufmt, i);
28097 +               entry = debugfs_create_file(name, 0444, dentry,
28098 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
28099 +               my_hist = &per_cpu(irqsoff_hist, i);
28100 +               atomic_set(&my_hist->hist_mode, 1);
28101 +               my_hist->min_lat = LONG_MAX;
28102 +       }
28103 +       entry = debugfs_create_file("reset", 0644, dentry,
28104 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
28105 +#endif
28107 +#ifdef CONFIG_PREEMPT_OFF_HIST
28108 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
28109 +           latency_hist_root);
28110 +       for_each_possible_cpu(i) {
28111 +               sprintf(name, cpufmt, i);
28112 +               entry = debugfs_create_file(name, 0444, dentry,
28113 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
28114 +               my_hist = &per_cpu(preemptoff_hist, i);
28115 +               atomic_set(&my_hist->hist_mode, 1);
28116 +               my_hist->min_lat = LONG_MAX;
28117 +       }
28118 +       entry = debugfs_create_file("reset", 0644, dentry,
28119 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
28120 +#endif
28122 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
28123 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
28124 +           latency_hist_root);
28125 +       for_each_possible_cpu(i) {
28126 +               sprintf(name, cpufmt, i);
28127 +               entry = debugfs_create_file(name, 0444, dentry,
28128 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
28129 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
28130 +               atomic_set(&my_hist->hist_mode, 1);
28131 +               my_hist->min_lat = LONG_MAX;
28132 +       }
28133 +       entry = debugfs_create_file("reset", 0644, dentry,
28134 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
28135 +#endif
28137 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
28138 +       entry = debugfs_create_file("preemptirqsoff", 0644,
28139 +           enable_root, (void *)&preemptirqsoff_enabled_data,
28140 +           &enable_fops);
28141 +#endif
28143 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
28144 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
28145 +           latency_hist_root);
28146 +       dentry_sharedprio = debugfs_create_dir(
28147 +           wakeup_latency_hist_dir_sharedprio, dentry);
28148 +       for_each_possible_cpu(i) {
28149 +               sprintf(name, cpufmt, i);
28151 +               entry = debugfs_create_file(name, 0444, dentry,
28152 +                   &per_cpu(wakeup_latency_hist, i),
28153 +                   &latency_hist_fops);
28154 +               my_hist = &per_cpu(wakeup_latency_hist, i);
28155 +               atomic_set(&my_hist->hist_mode, 1);
28156 +               my_hist->min_lat = LONG_MAX;
28158 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
28159 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
28160 +                   &latency_hist_fops);
28161 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
28162 +               atomic_set(&my_hist->hist_mode, 1);
28163 +               my_hist->min_lat = LONG_MAX;
28165 +               sprintf(name, cpufmt_maxlatproc, i);
28167 +               mp = &per_cpu(wakeup_maxlatproc, i);
28168 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28169 +                   &maxlatproc_fops);
28170 +               clear_maxlatprocdata(mp);
28172 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
28173 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
28174 +                   &maxlatproc_fops);
28175 +               clear_maxlatprocdata(mp);
28176 +       }
28177 +       entry = debugfs_create_file("pid", 0644, dentry,
28178 +           (void *)&wakeup_pid, &pid_fops);
28179 +       entry = debugfs_create_file("reset", 0644, dentry,
28180 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
28181 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
28182 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
28183 +       entry = debugfs_create_file("wakeup", 0644,
28184 +           enable_root, (void *)&wakeup_latency_enabled_data,
28185 +           &enable_fops);
28186 +#endif
28188 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
28189 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
28190 +           latency_hist_root);
28191 +       for_each_possible_cpu(i) {
28192 +               sprintf(name, cpufmt, i);
28193 +               entry = debugfs_create_file(name, 0444, dentry,
28194 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
28195 +               my_hist = &per_cpu(missed_timer_offsets, i);
28196 +               atomic_set(&my_hist->hist_mode, 1);
28197 +               my_hist->min_lat = LONG_MAX;
28199 +               sprintf(name, cpufmt_maxlatproc, i);
28200 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
28201 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28202 +                   &maxlatproc_fops);
28203 +               clear_maxlatprocdata(mp);
28204 +       }
28205 +       entry = debugfs_create_file("pid", 0644, dentry,
28206 +           (void *)&missed_timer_offsets_pid, &pid_fops);
28207 +       entry = debugfs_create_file("reset", 0644, dentry,
28208 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
28209 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
28210 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
28211 +           &enable_fops);
28212 +#endif
28214 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
28215 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
28216 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
28217 +           latency_hist_root);
28218 +       for_each_possible_cpu(i) {
28219 +               sprintf(name, cpufmt, i);
28220 +               entry = debugfs_create_file(name, 0444, dentry,
28221 +                   &per_cpu(timerandwakeup_latency_hist, i),
28222 +                   &latency_hist_fops);
28223 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
28224 +               atomic_set(&my_hist->hist_mode, 1);
28225 +               my_hist->min_lat = LONG_MAX;
28227 +               sprintf(name, cpufmt_maxlatproc, i);
28228 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
28229 +               entry = debugfs_create_file(name, 0444, dentry, mp,
28230 +                   &maxlatproc_fops);
28231 +               clear_maxlatprocdata(mp);
28232 +       }
28233 +       entry = debugfs_create_file("reset", 0644, dentry,
28234 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
28235 +       entry = debugfs_create_file("timerandwakeup", 0644,
28236 +           enable_root, (void *)&timerandwakeup_enabled_data,
28237 +           &enable_fops);
28238 +#endif
28239 +       return 0;
28242 +device_initcall(latency_hist_init);
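latency_hist_init() only creates the debugfs layout; nothing is recorded until
one of the files under the "enable" directory is switched on. Below is a
minimal user-space sketch of the resulting interface; the paths are assumptions
based on the directory-name strings defined earlier in this file and on debugfs
being mounted at /sys/kernel/debug, and the "<usecs> <count>" output format is
likewise an assumption about the histogram show routine defined earlier:

        #include <stdio.h>

        int main(void)
        {
                char line[256];
                FILE *f;

                /* turn the wakeup latency histograms on (enable/wakeup file) */
                f = fopen("/sys/kernel/debug/latency_hist/enable/wakeup", "w");
                if (!f)
                        return 1;
                fputs("1\n", f);
                fclose(f);

                /* dump the per-CPU histogram for CPU0 */
                f = fopen("/sys/kernel/debug/latency_hist/wakeup/CPU0", "r");
                if (!f)
                        return 1;
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);   /* assumed "<usecs> <count>" per line */
                fclose(f);
                return 0;
        }

Writing a task's PID to the per-histogram "pid" file created above restricts
accounting to that task, and writing to "reset" clears the buckets via
latency_hist_reset().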
28243 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
28244 index 4c21c0b7dc91..d0e52efed27a 100644
28245 --- a/kernel/trace/trace.c
28246 +++ b/kernel/trace/trace.c
28247 @@ -1652,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
28248         struct task_struct *tsk = current;
28250         entry->preempt_count            = pc & 0xff;
28251 +       entry->preempt_lazy_count       = preempt_lazy_count();
28252         entry->pid                      = (tsk) ? tsk->pid : 0;
28253         entry->flags =
28254  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
28255 @@ -1661,8 +1662,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
28256  #endif
28257                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
28258                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
28259 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
28260 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
28261 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
28262                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
28264 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
28266  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
28268 @@ -2555,14 +2559,17 @@ get_total_entries(struct trace_buffer *buf,
28270  static void print_lat_help_header(struct seq_file *m)
28272 -       seq_puts(m, "#                  _------=> CPU#            \n"
28273 -                   "#                 / _-----=> irqs-off        \n"
28274 -                   "#                | / _----=> need-resched    \n"
28275 -                   "#                || / _---=> hardirq/softirq \n"
28276 -                   "#                ||| / _--=> preempt-depth   \n"
28277 -                   "#                |||| /     delay            \n"
28278 -                   "#  cmd     pid   ||||| time  |   caller      \n"
28279 -                   "#     \\   /      |||||  \\    |   /         \n");
28280 +       seq_puts(m, "#                  _--------=> CPU#              \n"
28281 +                   "#                 / _-------=> irqs-off          \n"
28282 +                   "#                | / _------=> need-resched      \n"
28283 +                   "#                || / _-----=> need-resched_lazy \n"
28284 +                   "#                ||| / _----=> hardirq/softirq   \n"
28285 +                   "#                |||| / _---=> preempt-depth     \n"
28286 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
28287 +                   "#                |||||| / _-=> migrate-disable   \n"
28288 +                   "#                ||||||| /     delay             \n"
28289 +                   "# cmd     pid    |||||||| time   |  caller       \n"
28290 +                   "#     \\   /      ||||||||   \\    |  /            \n");
28293  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
28294 @@ -2588,11 +2595,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
28295         print_event_info(buf, m);
28296         seq_puts(m, "#                              _-----=> irqs-off\n"
28297                     "#                             / _----=> need-resched\n"
28298 -                   "#                            | / _---=> hardirq/softirq\n"
28299 -                   "#                            || / _--=> preempt-depth\n"
28300 -                   "#                            ||| /     delay\n"
28301 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
28302 -                   "#              | |       |   ||||       |         |\n");
28303 +                   "#                            |/  _-----=> need-resched_lazy\n"
28304 +                   "#                            || / _---=> hardirq/softirq\n"
28305 +                   "#                            ||| / _--=> preempt-depth\n"
28306 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
28307 +                   "#                            ||||| / _-=> migrate-disable   \n"
28308 +                   "#                            |||||| /    delay\n"
28309 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
28310 +                   "#              | |       |   |||||||      |         |\n");
28313  void
28314 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
28315 index 919d9d07686f..3bf86ece683c 100644
28316 --- a/kernel/trace/trace.h
28317 +++ b/kernel/trace/trace.h
28318 @@ -117,6 +117,7 @@ struct kretprobe_trace_entry_head {
28319   *  NEED_RESCHED       - reschedule is requested
28320   *  HARDIRQ            - inside an interrupt handler
28321   *  SOFTIRQ            - inside a softirq handler
28322 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
28323   */
28324  enum trace_flag_type {
28325         TRACE_FLAG_IRQS_OFF             = 0x01,
28326 @@ -125,6 +126,7 @@ enum trace_flag_type {
28327         TRACE_FLAG_HARDIRQ              = 0x08,
28328         TRACE_FLAG_SOFTIRQ              = 0x10,
28329         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
28330 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x40,
28331  };
28333  #define TRACE_BUF_SIZE         1024
28334 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
28335 index 996f0fd34312..5bd79b347398 100644
28336 --- a/kernel/trace/trace_events.c
28337 +++ b/kernel/trace/trace_events.c
28338 @@ -188,6 +188,8 @@ static int trace_define_common_fields(void)
28339         __common_field(unsigned char, flags);
28340         __common_field(unsigned char, preempt_count);
28341         __common_field(int, pid);
28342 +       __common_field(unsigned short, migrate_disable);
28343 +       __common_field(unsigned short, padding);
28345         return ret;
28347 @@ -244,6 +246,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
28349         local_save_flags(fbuffer->flags);
28350         fbuffer->pc = preempt_count();
28351 +       /*
28352 +        * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
28353 +        * preemption (adding one to the preempt_count). Since we are
28354 +        * interested in the preempt_count at the time the tracepoint was
28355 +        * hit, we need to subtract one to offset the increment.
28356 +        */
28357 +       if (IS_ENABLED(CONFIG_PREEMPT))
28358 +               fbuffer->pc--;
28359         fbuffer->trace_file = trace_file;
28361         fbuffer->event =
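The comment added to trace_event_buffer_reserve() above is the whole story
behind the IS_ENABLED(CONFIG_PREEMPT) decrement; a short sketch with
illustrative values of what ends up being recorded:

        /* If the traced caller runs with preempt_count() == 0, the
         * tracepoint's own preempt_disable() makes preempt_count() read 1
         * inside the handler on CONFIG_PREEMPT kernels.  Storing pc - 1
         * restores the caller's view, so trace_print_lat_fmt() below prints
         * '.' rather than '1' in the preempt-depth column. */
        unsigned int pc = preempt_count();      /* caller depth + 1 here */
        if (IS_ENABLED(CONFIG_PREEMPT))
                pc--;                           /* record the caller's depth */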
28362 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
28363 index be3222b7d72e..553e71254ad6 100644
28364 --- a/kernel/trace/trace_irqsoff.c
28365 +++ b/kernel/trace/trace_irqsoff.c
28366 @@ -13,6 +13,7 @@
28367  #include <linux/uaccess.h>
28368  #include <linux/module.h>
28369  #include <linux/ftrace.h>
28370 +#include <trace/events/hist.h>
28372  #include "trace.h"
28374 @@ -424,11 +425,13 @@ void start_critical_timings(void)
28376         if (preempt_trace() || irq_trace())
28377                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28378 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
28380  EXPORT_SYMBOL_GPL(start_critical_timings);
28382  void stop_critical_timings(void)
28384 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
28385         if (preempt_trace() || irq_trace())
28386                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28388 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
28389  #ifdef CONFIG_PROVE_LOCKING
28390  void time_hardirqs_on(unsigned long a0, unsigned long a1)
28392 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
28393         if (!preempt_trace() && irq_trace())
28394                 stop_critical_timing(a0, a1);
28396 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
28398         if (!preempt_trace() && irq_trace())
28399                 start_critical_timing(a0, a1);
28400 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
28403  #else /* !CONFIG_PROVE_LOCKING */
28404 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
28405   */
28406  void trace_hardirqs_on(void)
28408 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
28409         if (!preempt_trace() && irq_trace())
28410                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28412 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
28414         if (!preempt_trace() && irq_trace())
28415                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
28416 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
28418  EXPORT_SYMBOL(trace_hardirqs_off);
28420  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
28422 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
28423         if (!preempt_trace() && irq_trace())
28424                 stop_critical_timing(CALLER_ADDR0, caller_addr);
28426 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
28428         if (!preempt_trace() && irq_trace())
28429                 start_critical_timing(CALLER_ADDR0, caller_addr);
28430 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
28432  EXPORT_SYMBOL(trace_hardirqs_off_caller);
28434 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
28435  #ifdef CONFIG_PREEMPT_TRACER
28436  void trace_preempt_on(unsigned long a0, unsigned long a1)
28438 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
28439         if (preempt_trace() && !irq_trace())
28440                 stop_critical_timing(a0, a1);
28443  void trace_preempt_off(unsigned long a0, unsigned long a1)
28445 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
28446         if (preempt_trace() && !irq_trace())
28447                 start_critical_timing(a0, a1);
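Each of these hooks fires the preemptirqsoff_hist tracepoint that the latency
histogram code earlier in this patch attaches to via
register_trace_preemptirqsoff_hist(). The event itself is declared in
include/trace/events/hist.h, added earlier in this patch; a rough sketch of the
shape such a declaration takes (field layout and format string here are
illustrative, not the patch's exact definition):

        TRACE_EVENT(preemptirqsoff_hist,
                TP_PROTO(int reason, int starthist),
                TP_ARGS(reason, starthist),
                TP_STRUCT__entry(
                        __field(int, reason)
                        __field(int, starthist)
                ),
                TP_fast_assign(
                        __entry->reason    = reason;
                        __entry->starthist = starthist;
                ),
                TP_printk("reason=%d starthist=%d",
                          __entry->reason, __entry->starthist)
        );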
28449 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
28450 index 282982195e09..9f19d839a756 100644
28451 --- a/kernel/trace/trace_output.c
28452 +++ b/kernel/trace/trace_output.c
28453 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28455         char hardsoft_irq;
28456         char need_resched;
28457 +       char need_resched_lazy;
28458         char irqs_off;
28459         int hardirq;
28460         int softirq;
28461 @@ -413,6 +414,8 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28462                 need_resched = '.';
28463                 break;
28464         }
28465 +       need_resched_lazy =
28466 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
28468         hardsoft_irq =
28469                 (hardirq && softirq) ? 'H' :
28470 @@ -420,14 +423,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
28471                 softirq ? 's' :
28472                 '.';
28474 -       trace_seq_printf(s, "%c%c%c",
28475 -                        irqs_off, need_resched, hardsoft_irq);
28476 +       trace_seq_printf(s, "%c%c%c%c",
28477 +                        irqs_off, need_resched, need_resched_lazy,
28478 +                        hardsoft_irq);
28480         if (entry->preempt_count)
28481                 trace_seq_printf(s, "%x", entry->preempt_count);
28482         else
28483                 trace_seq_putc(s, '.');
28485 +       if (entry->preempt_lazy_count)
28486 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
28487 +       else
28488 +               trace_seq_putc(s, '.');
28490 +       if (entry->migrate_disable)
28491 +               trace_seq_printf(s, "%x", entry->migrate_disable);
28492 +       else
28493 +               trace_seq_putc(s, '.');
28495         return !trace_seq_has_overflowed(s);
28498 diff --git a/kernel/user.c b/kernel/user.c
28499 index b069ccbfb0b0..1a2e88e98b5e 100644
28500 --- a/kernel/user.c
28501 +++ b/kernel/user.c
28502 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
28503         if (!up)
28504                 return;
28506 -       local_irq_save(flags);
28507 +       local_irq_save_nort(flags);
28508         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
28509                 free_user(up, flags);
28510         else
28511 -               local_irq_restore(flags);
28512 +               local_irq_restore_nort(flags);
28515  struct user_struct *alloc_uid(kuid_t uid)
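The local_irq_save_nort()/local_irq_restore_nort() pair used in free_uid() is
the RT idiom for "only disable interrupts on kernels where the lock taken
underneath is still a real spinlock". A simplified sketch of how the helpers
behave; the actual macros are added to include/linux/irqflags.h elsewhere in
this patch, and the lines below are an approximation, not their exact
definition:

        #ifdef CONFIG_PREEMPT_RT_FULL
        /* the spinlock taken by the caller (uidhash_lock here) is a sleeping
         * lock on RT, so hard-disabling IRQs around it is neither needed nor
         * permitted; only the flags word is kept for the caller */
        # define local_irq_save_nort(flags)     local_save_flags(flags)
        # define local_irq_restore_nort(flags)  (void)(flags)
        #else
        /* behaviour unchanged on !RT kernels */
        # define local_irq_save_nort(flags)     local_irq_save(flags)
        # define local_irq_restore_nort(flags)  local_irq_restore(flags)
        #endif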
28516 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
28517 index c1e0b5f429b6..fa2e079cc314 100644
28518 --- a/kernel/watchdog.c
28519 +++ b/kernel/watchdog.c
28520 @@ -299,6 +299,8 @@ static int is_softlockup(unsigned long touch_ts)
28522  #ifdef CONFIG_HARDLOCKUP_DETECTOR
28524 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
28526  static struct perf_event_attr wd_hw_attr = {
28527         .type           = PERF_TYPE_HARDWARE,
28528         .config         = PERF_COUNT_HW_CPU_CYCLES,
28529 @@ -332,6 +334,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
28530                 /* only print hardlockups once */
28531                 if (__this_cpu_read(hard_watchdog_warn) == true)
28532                         return;
28533 +               /*
28534 +                * If early-printk is enabled then make sure we do not
28535 +                * lock up in printk() and kill console logging:
28536 +                */
28537 +               printk_kill();
28539 +               raw_spin_lock(&watchdog_output_lock);
28541                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
28542                 print_modules();
28543 @@ -349,8 +358,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
28544                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
28545                         trigger_allbutself_cpu_backtrace();
28547 +               raw_spin_unlock(&watchdog_output_lock);
28548                 if (hardlockup_panic)
28549 -                       panic("Hard LOCKUP");
28550 +                       nmi_panic(regs, "Hard LOCKUP");
28552                 __this_cpu_write(hard_watchdog_warn, true);
28553                 return;
28554 @@ -496,6 +506,7 @@ static void watchdog_enable(unsigned int cpu)
28555         /* kick off the timer for the hardlockup detector */
28556         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
28557         hrtimer->function = watchdog_timer_fn;
28558 +       hrtimer->irqsafe = 1;
28560         /* Enable the perf event */
28561         watchdog_nmi_enable(cpu);
28562 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
28563 index 2c2f971f3e75..d5b0f4fc0eb0 100644
28564 --- a/kernel/workqueue.c
28565 +++ b/kernel/workqueue.c
28566 @@ -48,6 +48,8 @@
28567  #include <linux/nodemask.h>
28568  #include <linux/moduleparam.h>
28569  #include <linux/uaccess.h>
28570 +#include <linux/locallock.h>
28571 +#include <linux/delay.h>
28573  #include "workqueue_internal.h"
28575 @@ -121,11 +123,16 @@ enum {
28576   *    cpu or grabbing pool->lock is enough for read access.  If
28577   *    POOL_DISASSOCIATED is set, it's identical to L.
28578   *
28579 + *    On RT we need the extra protection via rt_lock_idle_list() for
28580 + *    the list manipulations against read access from
28581 + *    wq_worker_sleeping(). All other places are nicely serialized via
28582 + *    pool->lock.
28583 + *
28584   * A: pool->attach_mutex protected.
28585   *
28586   * PL: wq_pool_mutex protected.
28587   *
28588 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
28589 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
28590   *
28591   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
28592   *
28593 @@ -134,7 +141,7 @@ enum {
28594   *
28595   * WQ: wq->mutex protected.
28596   *
28597 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
28598 + * WR: wq->mutex protected for writes.  RCU protected for reads.
28599   *
28600   * MD: wq_mayday_lock protected.
28601   */
28602 @@ -183,7 +190,7 @@ struct worker_pool {
28603         atomic_t                nr_running ____cacheline_aligned_in_smp;
28605         /*
28606 -        * Destruction of pool is sched-RCU protected to allow dereferences
28607 +        * Destruction of pool is RCU protected to allow dereferences
28608          * from get_work_pool().
28609          */
28610         struct rcu_head         rcu;
28611 @@ -212,7 +219,7 @@ struct pool_workqueue {
28612         /*
28613          * Release of unbound pwq is punted to system_wq.  See put_pwq()
28614          * and pwq_unbound_release_workfn() for details.  pool_workqueue
28615 -        * itself is also sched-RCU protected so that the first pwq can be
28616 +        * itself is also RCU protected so that the first pwq can be
28617          * determined without grabbing wq->mutex.
28618          */
28619         struct work_struct      unbound_release_work;
28620 @@ -331,6 +338,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
28621  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
28622  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
28624 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
28626  static int worker_thread(void *__worker);
28627  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
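pendingb_lock, declared just above, replaces the bare local_irq_save()/
local_irq_restore() pairs in the queueing paths further down
(try_to_grab_pending(), queue_work_on(), queue_delayed_work_on()). A simplified
sketch of the local-lock pattern it relies on; the real implementation lives in
include/linux/locallock.h, added earlier in this patch, and the details below
are an approximation:

        #ifdef CONFIG_PREEMPT_RT_BASE
        /* each CPU gets its own sleeping lock: manipulating the pending bit
         * stays preemptible, and only migration off the CPU is excluded while
         * the lock is held */
        #else
        /* degenerates to plain IRQ-off protection, so queue_work_on() and
         * friends behave exactly as before */
        # define local_lock_irqsave(lvar, flags)        local_irq_save(flags)
        # define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
        #endif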
28629 @@ -338,20 +347,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
28630  #include <trace/events/workqueue.h>
28632  #define assert_rcu_or_pool_mutex()                                     \
28633 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
28634 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
28635                          !lockdep_is_held(&wq_pool_mutex),              \
28636 -                        "sched RCU or wq_pool_mutex should be held")
28637 +                        "RCU or wq_pool_mutex should be held")
28639  #define assert_rcu_or_wq_mutex(wq)                                     \
28640 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
28641 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
28642                          !lockdep_is_held(&wq->mutex),                  \
28643 -                        "sched RCU or wq->mutex should be held")
28644 +                        "RCU or wq->mutex should be held")
28646  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
28647 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
28648 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
28649                          !lockdep_is_held(&wq->mutex) &&                \
28650                          !lockdep_is_held(&wq_pool_mutex),              \
28651 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
28652 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
28654  #define for_each_cpu_worker_pool(pool, cpu)                            \
28655         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
28656 @@ -363,7 +372,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
28657   * @pool: iteration cursor
28658   * @pi: integer used for iteration
28659   *
28660 - * This must be called either with wq_pool_mutex held or sched RCU read
28661 + * This must be called either with wq_pool_mutex held or RCU read
28662   * locked.  If the pool needs to be used beyond the locking in effect, the
28663   * caller is responsible for guaranteeing that the pool stays online.
28664   *
28665 @@ -395,7 +404,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
28666   * @pwq: iteration cursor
28667   * @wq: the target workqueue
28668   *
28669 - * This must be called either with wq->mutex held or sched RCU read locked.
28670 + * This must be called either with wq->mutex held or RCU read locked.
28671   * If the pwq needs to be used beyond the locking in effect, the caller is
28672   * responsible for guaranteeing that the pwq stays online.
28673   *
28674 @@ -407,6 +416,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
28675                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
28676                 else
28678 +#ifdef CONFIG_PREEMPT_RT_BASE
28679 +static inline void rt_lock_idle_list(struct worker_pool *pool)
28681 +       preempt_disable();
28683 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
28685 +       preempt_enable();
28687 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
28688 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
28689 +#else
28690 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
28691 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
28692 +static inline void sched_lock_idle_list(struct worker_pool *pool)
28694 +       spin_lock_irq(&pool->lock);
28696 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
28698 +       spin_unlock_irq(&pool->lock);
28700 +#endif
28703  #ifdef CONFIG_DEBUG_OBJECTS_WORK
28705  static struct debug_obj_descr work_debug_descr;
28706 @@ -557,7 +591,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
28707   * @wq: the target workqueue
28708   * @node: the node ID
28709   *
28710 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
28711 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
28712   * read locked.
28713   * If the pwq needs to be used beyond the locking in effect, the caller is
28714   * responsible for guaranteeing that the pwq stays online.
28715 @@ -701,8 +735,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
28716   * @work: the work item of interest
28717   *
28718   * Pools are created and destroyed under wq_pool_mutex, and allows read
28719 - * access under sched-RCU read lock.  As such, this function should be
28720 - * called under wq_pool_mutex or with preemption disabled.
28721 + * access under RCU read lock.  As such, this function should be
28722 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
28723   *
28724   * All fields of the returned pool are accessible as long as the above
28725   * mentioned locking is in effect.  If the returned pool needs to be used
28726 @@ -839,51 +873,44 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
28727   */
28728  static void wake_up_worker(struct worker_pool *pool)
28730 -       struct worker *worker = first_idle_worker(pool);
28731 +       struct worker *worker;
28733 +       rt_lock_idle_list(pool);
28735 +       worker = first_idle_worker(pool);
28737         if (likely(worker))
28738                 wake_up_process(worker->task);
28740 +       rt_unlock_idle_list(pool);
28743  /**
28744 - * wq_worker_waking_up - a worker is waking up
28745 - * @task: task waking up
28746 - * @cpu: CPU @task is waking up to
28747 + * wq_worker_running - a worker is running again
28748 + * @task: task returning from sleep
28749   *
28750 - * This function is called during try_to_wake_up() when a worker is
28751 - * being awoken.
28752 - *
28753 - * CONTEXT:
28754 - * spin_lock_irq(rq->lock)
28755 + * This function is called when a worker returns from schedule()
28756   */
28757 -void wq_worker_waking_up(struct task_struct *task, int cpu)
28758 +void wq_worker_running(struct task_struct *task)
28760         struct worker *worker = kthread_data(task);
28762 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
28763 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
28764 +       if (!worker->sleeping)
28765 +               return;
28766 +       if (!(worker->flags & WORKER_NOT_RUNNING))
28767                 atomic_inc(&worker->pool->nr_running);
28768 -       }
28769 +       worker->sleeping = 0;
28772  /**
28773   * wq_worker_sleeping - a worker is going to sleep
28774   * @task: task going to sleep
28775 - * @cpu: CPU in question, must be the current CPU number
28776 - *
28777 - * This function is called during schedule() when a busy worker is
28778 - * going to sleep.  Worker on the same cpu can be woken up by
28779 - * returning pointer to its task.
28780 - *
28781 - * CONTEXT:
28782 - * spin_lock_irq(rq->lock)
28783 - *
28784 - * Return:
28785 - * Worker task on @cpu to wake up, %NULL if none.
28786 + * This function is called from schedule() when a busy worker is
28787 + * going to sleep.
28788   */
28789 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28790 +void wq_worker_sleeping(struct task_struct *task)
28792 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
28793 +       struct worker *worker = kthread_data(task);
28794         struct worker_pool *pool;
28796         /*
28797 @@ -892,29 +919,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
28798          * checking NOT_RUNNING.
28799          */
28800         if (worker->flags & WORKER_NOT_RUNNING)
28801 -               return NULL;
28802 +               return;
28804         pool = worker->pool;
28806 -       /* this can only happen on the local cpu */
28807 -       if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
28808 -               return NULL;
28809 +       if (WARN_ON_ONCE(worker->sleeping))
28810 +               return;
28812 +       worker->sleeping = 1;
28814         /*
28815          * The counterpart of the following dec_and_test, implied mb,
28816          * worklist not empty test sequence is in insert_work().
28817          * Please read comment there.
28818 -        *
28819 -        * NOT_RUNNING is clear.  This means that we're bound to and
28820 -        * running on the local cpu w/ rq lock held and preemption
28821 -        * disabled, which in turn means that none else could be
28822 -        * manipulating idle_list, so dereferencing idle_list without pool
28823 -        * lock is safe.
28824          */
28825         if (atomic_dec_and_test(&pool->nr_running) &&
28826 -           !list_empty(&pool->worklist))
28827 -               to_wakeup = first_idle_worker(pool);
28828 -       return to_wakeup ? to_wakeup->task : NULL;
28829 +           !list_empty(&pool->worklist)) {
28830 +               sched_lock_idle_list(pool);
28831 +               wake_up_worker(pool);
28832 +               sched_unlock_idle_list(pool);
28833 +       }
28836  /**
28837 @@ -1108,12 +1132,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
28839         if (pwq) {
28840                 /*
28841 -                * As both pwqs and pools are sched-RCU protected, the
28842 +                * As both pwqs and pools are RCU protected, the
28843                  * following lock operations are safe.
28844                  */
28845 -               spin_lock_irq(&pwq->pool->lock);
28846 +               rcu_read_lock();
28847 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
28848                 put_pwq(pwq);
28849 -               spin_unlock_irq(&pwq->pool->lock);
28850 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
28851 +               rcu_read_unlock();
28852         }
28855 @@ -1215,7 +1241,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28856         struct worker_pool *pool;
28857         struct pool_workqueue *pwq;
28859 -       local_irq_save(*flags);
28860 +       local_lock_irqsave(pendingb_lock, *flags);
28862         /* try to steal the timer if it exists */
28863         if (is_dwork) {
28864 @@ -1234,6 +1260,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28865         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
28866                 return 0;
28868 +       rcu_read_lock();
28869         /*
28870          * The queueing is in progress, or it is already queued. Try to
28871          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
28872 @@ -1272,14 +1299,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
28873                 set_work_pool_and_keep_pending(work, pool->id);
28875                 spin_unlock(&pool->lock);
28876 +               rcu_read_unlock();
28877                 return 1;
28878         }
28879         spin_unlock(&pool->lock);
28880  fail:
28881 -       local_irq_restore(*flags);
28882 +       rcu_read_unlock();
28883 +       local_unlock_irqrestore(pendingb_lock, *flags);
28884         if (work_is_canceling(work))
28885                 return -ENOENT;
28886 -       cpu_relax();
28887 +       cpu_chill();
28888         return -EAGAIN;
28891 @@ -1348,7 +1377,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28892          * queued or lose PENDING.  Grabbing PENDING and queueing should
28893          * happen with IRQ disabled.
28894          */
28895 -       WARN_ON_ONCE(!irqs_disabled());
28896 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
28898         debug_work_activate(work);
28900 @@ -1356,6 +1385,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
28901         if (unlikely(wq->flags & __WQ_DRAINING) &&
28902             WARN_ON_ONCE(!is_chained_work(wq)))
28903                 return;
28905 +       rcu_read_lock();
28906  retry:
28907         if (req_cpu == WORK_CPU_UNBOUND)
28908                 cpu = raw_smp_processor_id();
28909 @@ -1412,10 +1443,8 @@ retry:
28910         /* pwq determined, queue */
28911         trace_workqueue_queue_work(req_cpu, pwq, work);
28913 -       if (WARN_ON(!list_empty(&work->entry))) {
28914 -               spin_unlock(&pwq->pool->lock);
28915 -               return;
28916 -       }
28917 +       if (WARN_ON(!list_empty(&work->entry)))
28918 +               goto out;
28920         pwq->nr_in_flight[pwq->work_color]++;
28921         work_flags = work_color_to_flags(pwq->work_color);
28922 @@ -1431,7 +1460,9 @@ retry:
28924         insert_work(pwq, work, worklist, work_flags);
28926 +out:
28927         spin_unlock(&pwq->pool->lock);
28928 +       rcu_read_unlock();
28931  /**
28932 @@ -1451,14 +1482,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
28933         bool ret = false;
28934         unsigned long flags;
28936 -       local_irq_save(flags);
28937 +       local_lock_irqsave(pendingb_lock, flags);
28939         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28940                 __queue_work(cpu, wq, work);
28941                 ret = true;
28942         }
28944 -       local_irq_restore(flags);
28945 +       local_unlock_irqrestore(pendingb_lock, flags);
28946         return ret;
28948  EXPORT_SYMBOL(queue_work_on);
28949 @@ -1525,14 +1556,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
28950         unsigned long flags;
28952         /* read the comment in __queue_work() */
28953 -       local_irq_save(flags);
28954 +       local_lock_irqsave(pendingb_lock, flags);
28956         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
28957                 __queue_delayed_work(cpu, wq, dwork, delay);
28958                 ret = true;
28959         }
28961 -       local_irq_restore(flags);
28962 +       local_unlock_irqrestore(pendingb_lock, flags);
28963         return ret;
28965  EXPORT_SYMBOL(queue_delayed_work_on);
28966 @@ -1567,7 +1598,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
28968         if (likely(ret >= 0)) {
28969                 __queue_delayed_work(cpu, wq, dwork, delay);
28970 -               local_irq_restore(flags);
28971 +               local_unlock_irqrestore(pendingb_lock, flags);
28972         }
28974         /* -ENOENT from try_to_grab_pending() becomes %true */
28975 @@ -1600,7 +1631,9 @@ static void worker_enter_idle(struct worker *worker)
28976         worker->last_active = jiffies;
28978         /* idle_list is LIFO */
28979 +       rt_lock_idle_list(pool);
28980         list_add(&worker->entry, &pool->idle_list);
28981 +       rt_unlock_idle_list(pool);
28983         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
28984                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
28985 @@ -1633,7 +1666,9 @@ static void worker_leave_idle(struct worker *worker)
28986                 return;
28987         worker_clr_flags(worker, WORKER_IDLE);
28988         pool->nr_idle--;
28989 +       rt_lock_idle_list(pool);
28990         list_del_init(&worker->entry);
28991 +       rt_unlock_idle_list(pool);
28994  static struct worker *alloc_worker(int node)
28995 @@ -1799,7 +1834,9 @@ static void destroy_worker(struct worker *worker)
28996         pool->nr_workers--;
28997         pool->nr_idle--;
28999 +       rt_lock_idle_list(pool);
29000         list_del_init(&worker->entry);
29001 +       rt_unlock_idle_list(pool);
29002         worker->flags |= WORKER_DIE;
29003         wake_up_process(worker->task);
29005 @@ -2716,14 +2753,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
29007         might_sleep();
29009 -       local_irq_disable();
29010 +       rcu_read_lock();
29011         pool = get_work_pool(work);
29012         if (!pool) {
29013 -               local_irq_enable();
29014 +               rcu_read_unlock();
29015                 return false;
29016         }
29018 -       spin_lock(&pool->lock);
29019 +       spin_lock_irq(&pool->lock);
29020         /* see the comment in try_to_grab_pending() with the same code */
29021         pwq = get_work_pwq(work);
29022         if (pwq) {
29023 @@ -2750,10 +2787,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
29024         else
29025                 lock_map_acquire_read(&pwq->wq->lockdep_map);
29026         lock_map_release(&pwq->wq->lockdep_map);
29028 +       rcu_read_unlock();
29029         return true;
29030  already_gone:
29031         spin_unlock_irq(&pool->lock);
29032 +       rcu_read_unlock();
29033         return false;
29036 @@ -2840,7 +2878,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
29038         /* tell other tasks trying to grab @work to back off */
29039         mark_work_canceling(work);
29040 -       local_irq_restore(flags);
29041 +       local_unlock_irqrestore(pendingb_lock, flags);
29043         flush_work(work);
29044         clear_work_data(work);
29045 @@ -2895,10 +2933,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
29046   */
29047  bool flush_delayed_work(struct delayed_work *dwork)
29049 -       local_irq_disable();
29050 +       local_lock_irq(pendingb_lock);
29051         if (del_timer_sync(&dwork->timer))
29052                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
29053 -       local_irq_enable();
29054 +       local_unlock_irq(pendingb_lock);
29055         return flush_work(&dwork->work);
29057  EXPORT_SYMBOL(flush_delayed_work);
29058 @@ -2933,7 +2971,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
29060         set_work_pool_and_clear_pending(&dwork->work,
29061                                         get_work_pool_id(&dwork->work));
29062 -       local_irq_restore(flags);
29063 +       local_unlock_irqrestore(pendingb_lock, flags);
29064         return ret;
29066  EXPORT_SYMBOL(cancel_delayed_work);
29067 @@ -3161,7 +3199,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
29068   * put_unbound_pool - put a worker_pool
29069   * @pool: worker_pool to put
29070   *
29071 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
29072 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
29073   * safe manner.  get_unbound_pool() calls this function on its failure path
29074   * and this function should be able to release pools which went through,
29075   * successfully or not, init_worker_pool().
29076 @@ -3215,8 +3253,8 @@ static void put_unbound_pool(struct worker_pool *pool)
29077         del_timer_sync(&pool->idle_timer);
29078         del_timer_sync(&pool->mayday_timer);
29080 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
29081 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
29082 +       /* RCU protected to allow dereferences from get_work_pool() */
29083 +       call_rcu(&pool->rcu, rcu_free_pool);
29086  /**
29087 @@ -3323,14 +3361,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
29088         put_unbound_pool(pool);
29089         mutex_unlock(&wq_pool_mutex);
29091 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
29092 +       call_rcu(&pwq->rcu, rcu_free_pwq);
29094         /*
29095          * If we're the last pwq going away, @wq is already dead and no one
29096          * is gonna access it anymore.  Schedule RCU free.
29097          */
29098         if (is_last)
29099 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
29100 +               call_rcu(&wq->rcu, rcu_free_wq);
29103  /**
29104 @@ -3983,7 +4021,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
29105                  * The base ref is never dropped on per-cpu pwqs.  Directly
29106                  * schedule RCU free.
29107                  */
29108 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
29109 +               call_rcu(&wq->rcu, rcu_free_wq);
29110         } else {
29111                 /*
29112                  * We're the sole accessor of @wq at this point.  Directly
29113 @@ -4076,7 +4114,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
29114         struct pool_workqueue *pwq;
29115         bool ret;
29117 -       rcu_read_lock_sched();
29118 +       rcu_read_lock();
29119 +       preempt_disable();
29121         if (cpu == WORK_CPU_UNBOUND)
29122                 cpu = smp_processor_id();
29123 @@ -4087,7 +4126,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
29124                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
29126         ret = !list_empty(&pwq->delayed_works);
29127 -       rcu_read_unlock_sched();
29128 +       preempt_enable();
29129 +       rcu_read_unlock();
29131         return ret;
29133 @@ -4113,15 +4153,15 @@ unsigned int work_busy(struct work_struct *work)
29134         if (work_pending(work))
29135                 ret |= WORK_BUSY_PENDING;
29137 -       local_irq_save(flags);
29138 +       rcu_read_lock();
29139         pool = get_work_pool(work);
29140         if (pool) {
29141 -               spin_lock(&pool->lock);
29142 +               spin_lock_irqsave(&pool->lock, flags);
29143                 if (find_worker_executing_work(pool, work))
29144                         ret |= WORK_BUSY_RUNNING;
29145 -               spin_unlock(&pool->lock);
29146 +               spin_unlock_irqrestore(&pool->lock, flags);
29147         }
29148 -       local_irq_restore(flags);
29149 +       rcu_read_unlock();
29151         return ret;
29153 @@ -4310,7 +4350,7 @@ void show_workqueue_state(void)
29154         unsigned long flags;
29155         int pi;
29157 -       rcu_read_lock_sched();
29158 +       rcu_read_lock();
29160         pr_info("Showing busy workqueues and worker pools:\n");
29162 @@ -4361,7 +4401,7 @@ void show_workqueue_state(void)
29163                 spin_unlock_irqrestore(&pool->lock, flags);
29164         }
29166 -       rcu_read_unlock_sched();
29167 +       rcu_read_unlock();
29170  /*
29171 @@ -4722,16 +4762,16 @@ bool freeze_workqueues_busy(void)
29172                  * nr_active is monotonically decreasing.  It's safe
29173                  * to peek without lock.
29174                  */
29175 -               rcu_read_lock_sched();
29176 +               rcu_read_lock();
29177                 for_each_pwq(pwq, wq) {
29178                         WARN_ON_ONCE(pwq->nr_active < 0);
29179                         if (pwq->nr_active) {
29180                                 busy = true;
29181 -                               rcu_read_unlock_sched();
29182 +                               rcu_read_unlock();
29183                                 goto out_unlock;
29184                         }
29185                 }
29186 -               rcu_read_unlock_sched();
29187 +               rcu_read_unlock();
29188         }
29189  out_unlock:
29190         mutex_unlock(&wq_pool_mutex);
29191 @@ -4921,7 +4961,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
29192         const char *delim = "";
29193         int node, written = 0;
29195 -       rcu_read_lock_sched();
29196 +       get_online_cpus();
29197 +       rcu_read_lock();
29198         for_each_node(node) {
29199                 written += scnprintf(buf + written, PAGE_SIZE - written,
29200                                      "%s%d:%d", delim, node,
29201 @@ -4929,7 +4970,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
29202                 delim = " ";
29203         }
29204         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
29205 -       rcu_read_unlock_sched();
29206 +       rcu_read_unlock();
29207 +       put_online_cpus();
29209         return written;
29211 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
29212 index 45215870ac6c..f000c4d6917e 100644
29213 --- a/kernel/workqueue_internal.h
29214 +++ b/kernel/workqueue_internal.h
29215 @@ -43,6 +43,7 @@ struct worker {
29216         unsigned long           last_active;    /* L: last active timestamp */
29217         unsigned int            flags;          /* X: flags */
29218         int                     id;             /* I: worker id */
29219 +       int                     sleeping;       /* None */
29221         /*
29222          * Opaque string set with work_set_desc().  Printed out with task
29223 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
29224   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
29225   * sched/core.c and workqueue.c.
29226   */
29227 -void wq_worker_waking_up(struct task_struct *task, int cpu);
29228 -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
29229 +void wq_worker_running(struct task_struct *task);
29230 +void wq_worker_sleeping(struct task_struct *task);
29232  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
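The workqueue hunks above swap the raw local_irq_save()/local_irq_restore() sections around the PENDING bit and pool lookups for the per-CPU local lock pendingb_lock, and convert the sched-RCU protection to plain RCU. A minimal usage sketch of that local-lock primitive, not part of the patch; it assumes the <linux/locallock.h> helpers added elsewhere in this series, and example_lock/example_count are made-up names:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    /* Illustrative only: placeholder per-CPU state guarded by a local lock. */
    static DEFINE_LOCAL_IRQ_LOCK(example_lock);
    static DEFINE_PER_CPU(int, example_count);

    static void example_update(void)
    {
            unsigned long flags;

            /* On PREEMPT_RT_FULL this takes a per-CPU sleeping spinlock, so
             * the section stays preemptible; on !RT it compiles down to
             * local_irq_save()/local_irq_restore(). */
            local_lock_irqsave(example_lock, flags);
            __this_cpu_inc(example_count);
            local_unlock_irqrestore(example_lock, flags);
    }

The cpu_relax() -> cpu_chill() change in try_to_grab_pending() complements this: with sleeping locks a busy-wait could spin against a preempted lock owner, so the RT variant backs off by sleeping briefly instead.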
29233 diff --git a/lib/Kconfig b/lib/Kconfig
29234 index 1a48744253d7..f75de578cca8 100644
29235 --- a/lib/Kconfig
29236 +++ b/lib/Kconfig
29237 @@ -397,6 +397,7 @@ config CHECK_SIGNATURE
29239  config CPUMASK_OFFSTACK
29240         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
29241 +       depends on !PREEMPT_RT_FULL
29242         help
29243           Use dynamic allocation for cpumask_var_t, instead of putting
29244           them on the stack.  This is a bit more expensive, but avoids
29245 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
29246 index 547f7f923dbc..8fcdbc2fc6d0 100644
29247 --- a/lib/debugobjects.c
29248 +++ b/lib/debugobjects.c
29249 @@ -309,7 +309,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
29250         struct debug_obj *obj;
29251         unsigned long flags;
29253 -       fill_pool();
29254 +#ifdef CONFIG_PREEMPT_RT_FULL
29255 +       if (preempt_count() == 0 && !irqs_disabled())
29256 +#endif
29257 +               fill_pool();
29259         db = get_bucket((unsigned long) addr);
29261 diff --git a/lib/idr.c b/lib/idr.c
29262 index 6098336df267..9decbe914595 100644
29263 --- a/lib/idr.c
29264 +++ b/lib/idr.c
29265 @@ -30,6 +30,7 @@
29266  #include <linux/idr.h>
29267  #include <linux/spinlock.h>
29268  #include <linux/percpu.h>
29269 +#include <linux/locallock.h>
29271  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
29272  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
29273 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
29274  static DEFINE_PER_CPU(int, idr_preload_cnt);
29275  static DEFINE_SPINLOCK(simple_ida_lock);
29277 +#ifdef CONFIG_PREEMPT_RT_FULL
29278 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
29279 +
29280 +static inline void idr_preload_lock(void)
29281 +{
29282 +       local_lock(idr_lock);
29283 +}
29284 +
29285 +static inline void idr_preload_unlock(void)
29286 +{
29287 +       local_unlock(idr_lock);
29288 +}
29289 +
29290 +void idr_preload_end(void)
29291 +{
29292 +       idr_preload_unlock();
29293 +}
29294 +EXPORT_SYMBOL(idr_preload_end);
29295 +#else
29296 +static inline void idr_preload_lock(void)
29297 +{
29298 +       preempt_disable();
29299 +}
29300 +
29301 +static inline void idr_preload_unlock(void)
29302 +{
29303 +       preempt_enable();
29304 +}
29305 +#endif
29308  /* the maximum ID which can be allocated given idr->layers */
29309  static int idr_max(int layers)
29311 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
29312          * context.  See idr_preload() for details.
29313          */
29314         if (!in_interrupt()) {
29315 -               preempt_disable();
29316 +               idr_preload_lock();
29317                 new = __this_cpu_read(idr_preload_head);
29318                 if (new) {
29319                         __this_cpu_write(idr_preload_head, new->ary[0]);
29320                         __this_cpu_dec(idr_preload_cnt);
29321                         new->ary[0] = NULL;
29322                 }
29323 -               preempt_enable();
29324 +               idr_preload_unlock();
29325                 if (new)
29326                         return new;
29327         }
29328 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
29329         idr_mark_full(pa, id);
29333  /**
29334   * idr_preload - preload for idr_alloc()
29335   * @gfp_mask: allocation mask to use for preloading
29336 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
29337         WARN_ON_ONCE(in_interrupt());
29338         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
29340 -       preempt_disable();
29341 +       idr_preload_lock();
29343         /*
29344          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
29345 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
29346         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
29347                 struct idr_layer *new;
29349 -               preempt_enable();
29350 +               idr_preload_unlock();
29351                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
29352 -               preempt_disable();
29353 +               idr_preload_lock();
29354                 if (!new)
29355                         break;
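For reference, the caller pattern this conversion keeps working; a sketch only (example_idr, example_idr_lock and the object pointer are placeholders), built from the idr_preload()/idr_alloc()/idr_preload_end() API visible above. The point of the change: on RT the inner spin_lock() is a sleeping lock, so the preload section can no longer rely on preempt_disable() and uses the idr_lock local lock instead:

    #include <linux/idr.h>
    #include <linux/spinlock.h>
    #include <linux/gfp.h>

    static DEFINE_IDR(example_idr);
    static DEFINE_SPINLOCK(example_idr_lock);

    /* Illustrative allocation of an ID for 'obj'; placeholder names. */
    static int example_alloc_id(void *obj)
    {
            int id;

            idr_preload(GFP_KERNEL);        /* idr_preload_lock() inside    */
            spin_lock(&example_idr_lock);
            id = idr_alloc(&example_idr, obj, 0, 0, GFP_NOWAIT);
            spin_unlock(&example_idr_lock);
            idr_preload_end();              /* idr_preload_unlock() inside  */

            return id;                      /* >= 0 on success, else -errno */
    }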
29357 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
29358 index 872a15a2a637..b93a6103fa4d 100644
29359 --- a/lib/locking-selftest.c
29360 +++ b/lib/locking-selftest.c
29361 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
29362  #include "locking-selftest-spin-hardirq.h"
29363  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
29365 +#ifndef CONFIG_PREEMPT_RT_FULL
29367  #include "locking-selftest-rlock-hardirq.h"
29368  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
29370 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
29371  #include "locking-selftest-wlock-softirq.h"
29372  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
29374 +#endif
29376  #undef E1
29377  #undef E2
29379 +#ifndef CONFIG_PREEMPT_RT_FULL
29380  /*
29381   * Enabling hardirqs with a softirq-safe lock held:
29382   */
29383 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
29384  #undef E1
29385  #undef E2
29387 +#endif
29389  /*
29390   * Enabling irqs with an irq-safe lock held:
29391   */
29392 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
29393  #include "locking-selftest-spin-hardirq.h"
29394  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
29396 +#ifndef CONFIG_PREEMPT_RT_FULL
29398  #include "locking-selftest-rlock-hardirq.h"
29399  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
29401 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
29402  #include "locking-selftest-wlock-softirq.h"
29403  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
29405 +#endif
29407  #undef E1
29408  #undef E2
29410 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
29411  #include "locking-selftest-spin-hardirq.h"
29412  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
29414 +#ifndef CONFIG_PREEMPT_RT_FULL
29416  #include "locking-selftest-rlock-hardirq.h"
29417  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
29419 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
29420  #include "locking-selftest-wlock-softirq.h"
29421  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
29423 +#endif
29425  #undef E1
29426  #undef E2
29427  #undef E3
29428 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
29429  #include "locking-selftest-spin-hardirq.h"
29430  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
29432 +#ifndef CONFIG_PREEMPT_RT_FULL
29434  #include "locking-selftest-rlock-hardirq.h"
29435  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
29437 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
29438  #include "locking-selftest-wlock-softirq.h"
29439  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
29441 +#endif
29443  #undef E1
29444  #undef E2
29445  #undef E3
29447 +#ifndef CONFIG_PREEMPT_RT_FULL
29449  /*
29450   * read-lock / write-lock irq inversion.
29451   *
29452 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
29453  #undef E2
29454  #undef E3
29456 +#endif
29458 +#ifndef CONFIG_PREEMPT_RT_FULL
29460  /*
29461   * read-lock / write-lock recursion that is actually safe.
29462   */
29463 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
29464  #undef E2
29465  #undef E3
29467 +#endif
29469  /*
29470   * read-lock / write-lock recursion that is unsafe.
29471   */
29472 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
29474         printk("  --------------------------------------------------------------------------\n");
29476 +#ifndef CONFIG_PREEMPT_RT_FULL
29477         /*
29478          * irq-context testcases:
29479          */
29480 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
29482         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
29483  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
29484 +#else
29485 +       /* On -rt, we only do hardirq context test for raw spinlock */
29486 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
29487 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
29489 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
29490 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
29492 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
29493 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
29494 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
29495 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
29496 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
29497 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
29499 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
29500 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
29501 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
29502 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
29503 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
29504 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
29505 +#endif
29507         ww_tests();
29509 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
29510 index 6d40944960de..822a2c027e72 100644
29511 --- a/lib/percpu_ida.c
29512 +++ b/lib/percpu_ida.c
29513 @@ -26,6 +26,9 @@
29514  #include <linux/string.h>
29515  #include <linux/spinlock.h>
29516  #include <linux/percpu_ida.h>
29517 +#include <linux/locallock.h>
29519 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
29521  struct percpu_ida_cpu {
29522         /*
29523 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29524         unsigned long flags;
29525         int tag;
29527 -       local_irq_save(flags);
29528 +       local_lock_irqsave(irq_off_lock, flags);
29529         tags = this_cpu_ptr(pool->tag_cpu);
29531         /* Fastpath */
29532         tag = alloc_local_tag(tags);
29533         if (likely(tag >= 0)) {
29534 -               local_irq_restore(flags);
29535 +               local_unlock_irqrestore(irq_off_lock, flags);
29536                 return tag;
29537         }
29539 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29541                 if (!tags->nr_free)
29542                         alloc_global_tags(pool, tags);
29544                 if (!tags->nr_free)
29545                         steal_tags(pool, tags);
29547 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29548                 }
29550                 spin_unlock(&pool->lock);
29551 -               local_irq_restore(flags);
29552 +               local_unlock_irqrestore(irq_off_lock, flags);
29554                 if (tag >= 0 || state == TASK_RUNNING)
29555                         break;
29556 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
29558                 schedule();
29560 -               local_irq_save(flags);
29561 +               local_lock_irqsave(irq_off_lock, flags);
29562                 tags = this_cpu_ptr(pool->tag_cpu);
29563         }
29564         if (state != TASK_RUNNING)
29565 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
29567         BUG_ON(tag >= pool->nr_tags);
29569 -       local_irq_save(flags);
29570 +       local_lock_irqsave(irq_off_lock, flags);
29571         tags = this_cpu_ptr(pool->tag_cpu);
29573         spin_lock(&tags->lock);
29574 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
29575                 spin_unlock(&pool->lock);
29576         }
29578 -       local_irq_restore(flags);
29579 +       local_unlock_irqrestore(irq_off_lock, flags);
29581  EXPORT_SYMBOL_GPL(percpu_ida_free);
29583 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
29584         struct percpu_ida_cpu *remote;
29585         unsigned cpu, i, err = 0;
29587 -       local_irq_save(flags);
29588 +       local_lock_irqsave(irq_off_lock, flags);
29589         for_each_possible_cpu(cpu) {
29590                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
29591                 spin_lock(&remote->lock);
29592 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
29593         }
29594         spin_unlock(&pool->lock);
29595  out:
29596 -       local_irq_restore(flags);
29597 +       local_unlock_irqrestore(irq_off_lock, flags);
29598         return err;
29600  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
29601 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
29602 index 6b79e9026e24..44bf36a396a9 100644
29603 --- a/lib/radix-tree.c
29604 +++ b/lib/radix-tree.c
29605 @@ -34,7 +34,7 @@
29606  #include <linux/bitops.h>
29607  #include <linux/rcupdate.h>
29608  #include <linux/preempt.h>             /* in_interrupt() */
29610 +#include <linux/locallock.h>
29612  /*
29613   * The height_to_maxindex array needs to be one deeper than the maximum
29614 @@ -69,6 +69,7 @@ struct radix_tree_preload {
29615         struct radix_tree_node *nodes;
29616  };
29617  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
29618 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
29620  static inline void *ptr_to_indirect(void *ptr)
29622 @@ -196,13 +197,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
29623                  * succeed in getting a node here (and never reach
29624                  * kmem_cache_alloc)
29625                  */
29626 -               rtp = this_cpu_ptr(&radix_tree_preloads);
29627 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
29628                 if (rtp->nr) {
29629                         ret = rtp->nodes;
29630                         rtp->nodes = ret->private_data;
29631                         ret->private_data = NULL;
29632                         rtp->nr--;
29633                 }
29634 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
29635                 /*
29636                  * Update the allocation stack trace as this is more useful
29637                  * for debugging.
29638 @@ -257,14 +259,14 @@ static int __radix_tree_preload(gfp_t gfp_mask)
29639         struct radix_tree_node *node;
29640         int ret = -ENOMEM;
29642 -       preempt_disable();
29643 +       local_lock(radix_tree_preloads_lock);
29644         rtp = this_cpu_ptr(&radix_tree_preloads);
29645         while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
29646 -               preempt_enable();
29647 +               local_unlock(radix_tree_preloads_lock);
29648                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
29649                 if (node == NULL)
29650                         goto out;
29651 -               preempt_disable();
29652 +               local_lock(radix_tree_preloads_lock);
29653                 rtp = this_cpu_ptr(&radix_tree_preloads);
29654                 if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
29655                         node->private_data = rtp->nodes;
29656 @@ -306,11 +308,17 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
29657         if (gfpflags_allow_blocking(gfp_mask))
29658                 return __radix_tree_preload(gfp_mask);
29659         /* Preloading doesn't help anything with this gfp mask, skip it */
29660 -       preempt_disable();
29661 +       local_lock(radix_tree_preloads_lock);
29662         return 0;
29664  EXPORT_SYMBOL(radix_tree_maybe_preload);
29666 +void radix_tree_preload_end(void)
29667 +{
29668 +       local_unlock(radix_tree_preloads_lock);
29669 +}
29670 +EXPORT_SYMBOL(radix_tree_preload_end);
29672  /*
29673   *     Return the maximum key which can be store into a
29674   *     radix tree with height HEIGHT.
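The matching caller pattern, again only as a sketch (example_tree and example_tree_lock are placeholders): radix_tree_preload() now enters the radix_tree_preloads_lock local lock and radix_tree_preload_end() leaves it, so on RT the preload window stays preemptible while the per-CPU preload pool remains consistent:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>
    #include <linux/gfp.h>

    static RADIX_TREE(example_tree, GFP_ATOMIC);
    static DEFINE_SPINLOCK(example_tree_lock);

    /* Illustrative insert; placeholder names, not from the patch. */
    static int example_insert(unsigned long index, void *item)
    {
            int err;

            err = radix_tree_preload(GFP_KERNEL);   /* takes the local lock */
            if (err)
                    return err;

            spin_lock(&example_tree_lock);
            err = radix_tree_insert(&example_tree, index, item);
            spin_unlock(&example_tree_lock);

            radix_tree_preload_end();               /* drops the local lock */
            return err;
    }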
29675 diff --git a/lib/rbtree.c b/lib/rbtree.c
29676 index 1356454e36de..d15d6c4327f1 100644
29677 --- a/lib/rbtree.c
29678 +++ b/lib/rbtree.c
29679 @@ -23,6 +23,7 @@
29681  #include <linux/rbtree_augmented.h>
29682  #include <linux/export.h>
29683 +#include <linux/rcupdate.h>
29685  /*
29686   * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
29687 @@ -590,3 +591,13 @@ struct rb_node *rb_first_postorder(const struct rb_root *root)
29688         return rb_left_deepest_node(root->rb_node);
29690  EXPORT_SYMBOL(rb_first_postorder);
29692 +void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
29693 +                                   struct rb_node **rb_link)
29694 +{
29695 +       node->__rb_parent_color = (unsigned long)parent;
29696 +       node->rb_left = node->rb_right = NULL;
29697 +
29698 +       rcu_assign_pointer(*rb_link, node);
29699 +}
29700 +EXPORT_SYMBOL(rb_link_node_rcu);
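A sketch of where the new helper slots in (struct example_node and example_rb_lock are placeholders, and the usual lockless-rbtree caveats apply: rotations in rb_insert_color() can still make a concurrent reader miss a node unless a latch-style scheme is layered on top). Writers insert under a lock; the rcu_assign_pointer() in rb_link_node_rcu() orders the node's initialisation against making it reachable:

    #include <linux/rbtree.h>
    #include <linux/rcupdate.h>
    #include <linux/spinlock.h>

    struct example_node {
            struct rb_node  rb;
            unsigned long   key;
    };

    static struct rb_root example_root = RB_ROOT;
    static DEFINE_SPINLOCK(example_rb_lock);

    /* Illustrative write-side insert; placeholder names. */
    static void example_insert(struct example_node *new)
    {
            struct rb_node **link = &example_root.rb_node, *parent = NULL;

            spin_lock(&example_rb_lock);
            while (*link) {
                    struct example_node *cur =
                            rb_entry(*link, struct example_node, rb);

                    parent = *link;
                    if (new->key < cur->key)
                            link = &(*link)->rb_left;
                    else
                            link = &(*link)->rb_right;
            }
            /* Publish the initialised node to lockless readers. */
            rb_link_node_rcu(&new->rb, parent, link);
            rb_insert_color(&new->rb, &example_root);
            spin_unlock(&example_rb_lock);
    }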
29701 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
29702 index bafa9933fa76..ebe3b7edd086 100644
29703 --- a/lib/scatterlist.c
29704 +++ b/lib/scatterlist.c
29705 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
29706                         flush_kernel_dcache_page(miter->page);
29708                 if (miter->__flags & SG_MITER_ATOMIC) {
29709 -                       WARN_ON_ONCE(preemptible());
29710 +                       WARN_ON_ONCE(!pagefault_disabled());
29711                         kunmap_atomic(miter->addr);
29712                 } else
29713                         kunmap(miter->page);
29714 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
29715         if (!sg_miter_skip(&miter, skip))
29716                 return false;
29718 -       local_irq_save(flags);
29719 +       local_irq_save_nort(flags);
29721         while (sg_miter_next(&miter) && offset < buflen) {
29722                 unsigned int len;
29723 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
29725         sg_miter_stop(&miter);
29727 -       local_irq_restore(flags);
29728 +       local_irq_restore_nort(flags);
29729         return offset;
29731  EXPORT_SYMBOL(sg_copy_buffer);
29732 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
29733 index 1afec32de6f2..11fa431046a8 100644
29734 --- a/lib/smp_processor_id.c
29735 +++ b/lib/smp_processor_id.c
29736 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
29737         if (!printk_ratelimit())
29738                 goto out_enable;
29740 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
29741 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
29742 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
29743 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
29744 +               current->comm, current->pid);
29746         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
29747         dump_stack();
29748 diff --git a/mm/Kconfig b/mm/Kconfig
29749 index 97a4e06b15c0..9614351e68b8 100644
29750 --- a/mm/Kconfig
29751 +++ b/mm/Kconfig
29752 @@ -392,7 +392,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
29754  config TRANSPARENT_HUGEPAGE
29755         bool "Transparent Hugepage Support"
29756 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
29757 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
29758         select COMPACTION
29759         help
29760           Transparent Hugepages allows the kernel to use huge pages and
29761 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
29762 index a988d4ef39da..f2c2ee1d5191 100644
29763 --- a/mm/backing-dev.c
29764 +++ b/mm/backing-dev.c
29765 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
29767         unsigned long flags;
29769 -       local_irq_save(flags);
29770 +       local_irq_save_nort(flags);
29771         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
29772 -               local_irq_restore(flags);
29773 +               local_irq_restore_nort(flags);
29774                 return;
29775         }
29777 diff --git a/mm/compaction.c b/mm/compaction.c
29778 index dba02dec7195..51963f58a29b 100644
29779 --- a/mm/compaction.c
29780 +++ b/mm/compaction.c
29781 @@ -1430,10 +1430,12 @@ check_drain:
29782                                 cc->migrate_pfn & ~((1UL << cc->order) - 1);
29784                         if (cc->last_migrated_pfn < current_block_start) {
29785 -                               cpu = get_cpu();
29786 +                               cpu = get_cpu_light();
29787 +                               local_lock_irq(swapvec_lock);
29788                                 lru_add_drain_cpu(cpu);
29789 +                               local_unlock_irq(swapvec_lock);
29790                                 drain_local_pages(zone);
29791 -                               put_cpu();
29792 +                               put_cpu_light();
29793                                 /* No more flushing until we migrate again */
29794                                 cc->last_migrated_pfn = 0;
29795                         }
29796 diff --git a/mm/filemap.c b/mm/filemap.c
29797 index 69f75c77c098..b203169ca0b4 100644
29798 --- a/mm/filemap.c
29799 +++ b/mm/filemap.c
29800 @@ -144,9 +144,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
29801                  * node->private_list is protected by
29802                  * mapping->tree_lock.
29803                  */
29804 -               if (!list_empty(&node->private_list))
29805 -                       list_lru_del(&workingset_shadow_nodes,
29806 +               if (!list_empty(&node->private_list)) {
29807 +                       local_lock(workingset_shadow_lock);
29808 +                       list_lru_del(&__workingset_shadow_nodes,
29809                                      &node->private_list);
29810 +                       local_unlock(workingset_shadow_lock);
29811 +               }
29812         }
29813         return 0;
29815 @@ -218,7 +221,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
29816         if (!workingset_node_pages(node) &&
29817             list_empty(&node->private_list)) {
29818                 node->private_data = mapping;
29819 -               list_lru_add(&workingset_shadow_nodes, &node->private_list);
29820 +               local_lock(workingset_shadow_lock);
29821 +               list_lru_add(&__workingset_shadow_nodes, &node->private_list);
29822 +               local_unlock(workingset_shadow_lock);
29823         }
29826 diff --git a/mm/highmem.c b/mm/highmem.c
29827 index 123bcd3ed4f2..16e8cf26d38a 100644
29828 --- a/mm/highmem.c
29829 +++ b/mm/highmem.c
29830 @@ -29,10 +29,11 @@
29831  #include <linux/kgdb.h>
29832  #include <asm/tlbflush.h>
29835 +#ifndef CONFIG_PREEMPT_RT_FULL
29836  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
29837  DEFINE_PER_CPU(int, __kmap_atomic_idx);
29838  #endif
29839 +#endif
29841  /*
29842   * Virtual_count is not a pure "count".
29843 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
29844  unsigned long totalhigh_pages __read_mostly;
29845  EXPORT_SYMBOL(totalhigh_pages);
29848 +#ifndef CONFIG_PREEMPT_RT_FULL
29849  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
29850 +#endif
29852  unsigned int nr_free_highpages (void)
29854 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
29855 index e25b93a4267d..1c619267d9da 100644
29856 --- a/mm/memcontrol.c
29857 +++ b/mm/memcontrol.c
29858 @@ -67,6 +67,8 @@
29859  #include <net/sock.h>
29860  #include <net/ip.h>
29861  #include <net/tcp_memcontrol.h>
29862 +#include <linux/locallock.h>
29864  #include "slab.h"
29866  #include <asm/uaccess.h>
29867 @@ -87,6 +89,7 @@ int do_swap_account __read_mostly;
29868  #define do_swap_account                0
29869  #endif
29871 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
29872  static const char * const mem_cgroup_stat_names[] = {
29873         "cache",
29874         "rss",
29875 @@ -1922,14 +1925,17 @@ static void drain_local_stock(struct work_struct *dummy)
29876   */
29877  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
29879 -       struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
29880 +       struct memcg_stock_pcp *stock;
29881 +       int cpu = get_cpu_light();
29883 +       stock = &per_cpu(memcg_stock, cpu);
29885         if (stock->cached != memcg) { /* reset if necessary */
29886                 drain_stock(stock);
29887                 stock->cached = memcg;
29888         }
29889         stock->nr_pages += nr_pages;
29890 -       put_cpu_var(memcg_stock);
29891 +       put_cpu_light();
29894  /*
29895 @@ -1945,7 +1951,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29896                 return;
29897         /* Notify other cpus that system-wide "drain" is running */
29898         get_online_cpus();
29899 -       curcpu = get_cpu();
29900 +       curcpu = get_cpu_light();
29901         for_each_online_cpu(cpu) {
29902                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
29903                 struct mem_cgroup *memcg;
29904 @@ -1962,7 +1968,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
29905                                 schedule_work_on(cpu, &stock->work);
29906                 }
29907         }
29908 -       put_cpu();
29909 +       put_cpu_light();
29910         put_online_cpus();
29911         mutex_unlock(&percpu_charge_mutex);
29913 @@ -4691,12 +4697,12 @@ static int mem_cgroup_move_account(struct page *page,
29915         ret = 0;
29917 -       local_irq_disable();
29918 +       local_lock_irq(event_lock);
29919         mem_cgroup_charge_statistics(to, page, nr_pages);
29920         memcg_check_events(to, page);
29921         mem_cgroup_charge_statistics(from, page, -nr_pages);
29922         memcg_check_events(from, page);
29923 -       local_irq_enable();
29924 +       local_unlock_irq(event_lock);
29925  out_unlock:
29926         unlock_page(page);
29927  out:
29928 @@ -5486,10 +5492,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
29929                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
29930         }
29932 -       local_irq_disable();
29933 +       local_lock_irq(event_lock);
29934         mem_cgroup_charge_statistics(memcg, page, nr_pages);
29935         memcg_check_events(memcg, page);
29936 -       local_irq_enable();
29937 +       local_unlock_irq(event_lock);
29939         if (do_swap_account && PageSwapCache(page)) {
29940                 swp_entry_t entry = { .val = page_private(page) };
29941 @@ -5545,14 +5551,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
29942                 memcg_oom_recover(memcg);
29943         }
29945 -       local_irq_save(flags);
29946 +       local_lock_irqsave(event_lock, flags);
29947         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
29948         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
29949         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
29950         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
29951         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
29952         memcg_check_events(memcg, dummy_page);
29953 -       local_irq_restore(flags);
29954 +       local_unlock_irqrestore(event_lock, flags);
29956         if (!mem_cgroup_is_root(memcg))
29957                 css_put_many(&memcg->css, nr_pages);
29958 @@ -5762,6 +5768,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29960         struct mem_cgroup *memcg, *swap_memcg;
29961         unsigned short oldid;
29962 +       unsigned long flags;
29964         VM_BUG_ON_PAGE(PageLRU(page), page);
29965         VM_BUG_ON_PAGE(page_count(page), page);
29966 @@ -5802,12 +5809,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
29967          * important here to have the interrupts disabled because it is the
29968          * only synchronisation we have for udpating the per-CPU variables.
29969          */
29970 +       local_lock_irqsave(event_lock, flags);
29971 +#ifndef CONFIG_PREEMPT_RT_BASE
29972         VM_BUG_ON(!irqs_disabled());
29973 +#endif
29974         mem_cgroup_charge_statistics(memcg, page, -1);
29975         memcg_check_events(memcg, page);
29977         if (!mem_cgroup_is_root(memcg))
29978                 css_put(&memcg->css);
29979 +       local_unlock_irqrestore(event_lock, flags);
29982  /**
29983 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
29984 index f802c2d216a7..b1b6f238e42d 100644
29985 --- a/mm/mmu_context.c
29986 +++ b/mm/mmu_context.c
29987 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
29988         struct task_struct *tsk = current;
29990         task_lock(tsk);
29991 +       preempt_disable_rt();
29992         active_mm = tsk->active_mm;
29993         if (active_mm != mm) {
29994                 atomic_inc(&mm->mm_count);
29995 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
29996         }
29997         tsk->mm = mm;
29998         switch_mm(active_mm, mm, tsk);
29999 +       preempt_enable_rt();
30000         task_unlock(tsk);
30001  #ifdef finish_arch_post_lock_switch
30002         finish_arch_post_lock_switch();
30003 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
30004 index 6f9005dcca2e..e8ec72251705 100644
30005 --- a/mm/page_alloc.c
30006 +++ b/mm/page_alloc.c
30007 @@ -60,6 +60,7 @@
30008  #include <linux/page_ext.h>
30009  #include <linux/hugetlb.h>
30010  #include <linux/sched/rt.h>
30011 +#include <linux/locallock.h>
30012  #include <linux/page_owner.h>
30013  #include <linux/kthread.h>
30015 @@ -264,6 +265,18 @@ EXPORT_SYMBOL(nr_node_ids);
30016  EXPORT_SYMBOL(nr_online_nodes);
30017  #endif
30019 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
30021 +#ifdef CONFIG_PREEMPT_RT_BASE
30022 +# define cpu_lock_irqsave(cpu, flags)          \
30023 +       local_lock_irqsave_on(pa_lock, flags, cpu)
30024 +# define cpu_unlock_irqrestore(cpu, flags)     \
30025 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
30026 +#else
30027 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
30028 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
30029 +#endif
30031  int page_group_by_mobility_disabled __read_mostly;
30033  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
30034 @@ -786,7 +799,7 @@ static inline int free_pages_check(struct page *page)
30037  /*
30038 - * Frees a number of pages from the PCP lists
30039 + * Frees a number of pages which have been collected from the pcp lists.
30040   * Assumes all pages on list are in same zone, and of same order.
30041   * count is the number of pages to free.
30042   *
30043 @@ -797,18 +810,53 @@ static inline int free_pages_check(struct page *page)
30044   * pinned" detection logic.
30045   */
30046  static void free_pcppages_bulk(struct zone *zone, int count,
30047 -                                       struct per_cpu_pages *pcp)
30048 +                              struct list_head *list)
30050 -       int migratetype = 0;
30051 -       int batch_free = 0;
30052         int to_free = count;
30053         unsigned long nr_scanned;
30054 +       unsigned long flags;
30056 +       spin_lock_irqsave(&zone->lock, flags);
30058 -       spin_lock(&zone->lock);
30059         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
30060         if (nr_scanned)
30061                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
30063 +       while (!list_empty(list)) {
30064 +               struct page *page = list_first_entry(list, struct page, lru);
30065 +               int mt; /* migratetype of the to-be-freed page */
30067 +               /* must delete as __free_one_page list manipulates */
30068 +               list_del(&page->lru);
30070 +               mt = get_pcppage_migratetype(page);
30071 +               /* MIGRATE_ISOLATE page should not go to pcplists */
30072 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
30073 +               /* Pageblock could have been isolated meanwhile */
30074 +               if (unlikely(has_isolate_pageblock(zone)))
30075 +                       mt = get_pageblock_migratetype(page);
30077 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
30078 +               trace_mm_page_pcpu_drain(page, 0, mt);
30079 +               to_free--;
30080 +       }
30081 +       WARN_ON(to_free != 0);
30082 +       spin_unlock_irqrestore(&zone->lock, flags);
30083 +}
30084 +
30085 +/*
30086 + * Moves a number of pages from the PCP lists to a separate list, which the
30087 + * caller then frees outside of the locked region.
30088 + *
30089 + * Assumes all pages on list are in same zone, and of same order.
30090 + * count is the number of pages to free.
30091 + */
30092 +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
30093 +                             struct list_head *dst)
30094 +{
30095 +       int migratetype = 0;
30096 +       int batch_free = 0;
30098         while (to_free) {
30099                 struct page *page;
30100                 struct list_head *list;
30101 @@ -824,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
30102                         batch_free++;
30103                         if (++migratetype == MIGRATE_PCPTYPES)
30104                                 migratetype = 0;
30105 -                       list = &pcp->lists[migratetype];
30106 +                       list = &src->lists[migratetype];
30107                 } while (list_empty(list));
30109                 /* This is the only non-empty list. Free them all. */
30110 @@ -832,24 +880,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
30111                         batch_free = to_free;
30113                 do {
30114 -                       int mt; /* migratetype of the to-be-freed page */
30116 -                       page = list_entry(list->prev, struct page, lru);
30117 -                       /* must delete as __free_one_page list manipulates */
30118 +                       page = list_last_entry(list, struct page, lru);
30119                         list_del(&page->lru);
30121 -                       mt = get_pcppage_migratetype(page);
30122 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
30123 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
30124 -                       /* Pageblock could have been isolated meanwhile */
30125 -                       if (unlikely(has_isolate_pageblock(zone)))
30126 -                               mt = get_pageblock_migratetype(page);
30128 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
30129 -                       trace_mm_page_pcpu_drain(page, 0, mt);
30130 +                       list_add(&page->lru, dst);
30131                 } while (--to_free && --batch_free && !list_empty(list));
30132         }
30133 -       spin_unlock(&zone->lock);
30136  static void free_one_page(struct zone *zone,
30137 @@ -858,7 +894,9 @@ static void free_one_page(struct zone *zone,
30138                                 int migratetype)
30140         unsigned long nr_scanned;
30141 -       spin_lock(&zone->lock);
30142 +       unsigned long flags;
30144 +       spin_lock_irqsave(&zone->lock, flags);
30145         nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
30146         if (nr_scanned)
30147                 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
30148 @@ -868,7 +906,7 @@ static void free_one_page(struct zone *zone,
30149                 migratetype = get_pfnblock_migratetype(page, pfn);
30150         }
30151         __free_one_page(page, pfn, zone, order, migratetype);
30152 -       spin_unlock(&zone->lock);
30153 +       spin_unlock_irqrestore(&zone->lock, flags);
30156  static int free_tail_pages_check(struct page *head_page, struct page *page)
30157 @@ -1019,10 +1057,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
30158                 return;
30160         migratetype = get_pfnblock_migratetype(page, pfn);
30161 -       local_irq_save(flags);
30162 +       local_lock_irqsave(pa_lock, flags);
30163         __count_vm_events(PGFREE, 1 << order);
30164         free_one_page(page_zone(page), page, pfn, order, migratetype);
30165 -       local_irq_restore(flags);
30166 +       local_unlock_irqrestore(pa_lock, flags);
30169  static void __init __free_pages_boot_core(struct page *page,
30170 @@ -1879,16 +1917,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
30171  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
30173         unsigned long flags;
30174 +       LIST_HEAD(dst);
30175         int to_drain, batch;
30177 -       local_irq_save(flags);
30178 +       local_lock_irqsave(pa_lock, flags);
30179         batch = READ_ONCE(pcp->batch);
30180         to_drain = min(pcp->count, batch);
30181         if (to_drain > 0) {
30182 -               free_pcppages_bulk(zone, to_drain, pcp);
30183 +               isolate_pcp_pages(to_drain, pcp, &dst);
30184                 pcp->count -= to_drain;
30185         }
30186 -       local_irq_restore(flags);
30187 +       local_unlock_irqrestore(pa_lock, flags);
30188 +       free_pcppages_bulk(zone, to_drain, &dst);
30190  #endif
30192 @@ -1904,16 +1944,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
30193         unsigned long flags;
30194         struct per_cpu_pageset *pset;
30195         struct per_cpu_pages *pcp;
30196 +       LIST_HEAD(dst);
30197 +       int count;
30199 -       local_irq_save(flags);
30200 +       cpu_lock_irqsave(cpu, flags);
30201         pset = per_cpu_ptr(zone->pageset, cpu);
30203         pcp = &pset->pcp;
30204 -       if (pcp->count) {
30205 -               free_pcppages_bulk(zone, pcp->count, pcp);
30206 +       count = pcp->count;
30207 +       if (count) {
30208 +               isolate_pcp_pages(count, pcp, &dst);
30209                 pcp->count = 0;
30210         }
30211 -       local_irq_restore(flags);
30212 +       cpu_unlock_irqrestore(cpu, flags);
30213 +       if (count)
30214 +               free_pcppages_bulk(zone, count, &dst);
30217  /*
30218 @@ -1999,8 +2044,17 @@ void drain_all_pages(struct zone *zone)
30219                 else
30220                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
30221         }
30222 +#ifndef CONFIG_PREEMPT_RT_BASE
30223         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
30224                                                                 zone, 1);
30225 +#else
30226 +       for_each_cpu(cpu, &cpus_with_pcps) {
30227 +               if (zone)
30228 +                       drain_pages_zone(cpu, zone);
30229 +               else
30230 +                       drain_pages(cpu);
30231 +       }
30232 +#endif
30235  #ifdef CONFIG_HIBERNATION
30236 @@ -2056,7 +2110,7 @@ void free_hot_cold_page(struct page *page, bool cold)
30238         migratetype = get_pfnblock_migratetype(page, pfn);
30239         set_pcppage_migratetype(page, migratetype);
30240 -       local_irq_save(flags);
30241 +       local_lock_irqsave(pa_lock, flags);
30242         __count_vm_event(PGFREE);
30244         /*
30245 @@ -2082,12 +2136,17 @@ void free_hot_cold_page(struct page *page, bool cold)
30246         pcp->count++;
30247         if (pcp->count >= pcp->high) {
30248                 unsigned long batch = READ_ONCE(pcp->batch);
30249 -               free_pcppages_bulk(zone, batch, pcp);
30250 +               LIST_HEAD(dst);
30252 +               isolate_pcp_pages(batch, pcp, &dst);
30253                 pcp->count -= batch;
30254 +               local_unlock_irqrestore(pa_lock, flags);
30255 +               free_pcppages_bulk(zone, batch, &dst);
30256 +               return;
30257         }
30259  out:
30260 -       local_irq_restore(flags);
30261 +       local_unlock_irqrestore(pa_lock, flags);
30264  /*
30265 @@ -2222,7 +2281,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30266                 struct per_cpu_pages *pcp;
30267                 struct list_head *list;
30269 -               local_irq_save(flags);
30270 +               local_lock_irqsave(pa_lock, flags);
30271                 pcp = &this_cpu_ptr(zone->pageset)->pcp;
30272                 list = &pcp->lists[migratetype];
30273                 if (list_empty(list)) {
30274 @@ -2254,7 +2313,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30275                          */
30276                         WARN_ON_ONCE(order > 1);
30277                 }
30278 -               spin_lock_irqsave(&zone->lock, flags);
30279 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
30281                 page = NULL;
30282                 if (alloc_flags & ALLOC_HARDER) {
30283 @@ -2264,11 +2323,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30284                 }
30285                 if (!page)
30286                         page = __rmqueue(zone, order, migratetype, gfp_flags);
30287 -               spin_unlock(&zone->lock);
30288 -               if (!page)
30289 +               if (!page) {
30290 +                       spin_unlock(&zone->lock);
30291                         goto failed;
30292 +               }
30293                 __mod_zone_freepage_state(zone, -(1 << order),
30294                                           get_pcppage_migratetype(page));
30295 +               spin_unlock(&zone->lock);
30296         }
30298         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
30299 @@ -2278,13 +2339,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
30301         __count_zone_vm_events(PGALLOC, zone, 1 << order);
30302         zone_statistics(preferred_zone, zone, gfp_flags);
30303 -       local_irq_restore(flags);
30304 +       local_unlock_irqrestore(pa_lock, flags);
30306         VM_BUG_ON_PAGE(bad_range(zone, page), page);
30307         return page;
30309  failed:
30310 -       local_irq_restore(flags);
30311 +       local_unlock_irqrestore(pa_lock, flags);
30312         return NULL;
30315 @@ -5953,6 +6014,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
30316  void __init page_alloc_init(void)
30318         hotcpu_notifier(page_alloc_cpu_notify, 0);
30319 +       local_irq_lock_init(pa_lock);
30322  /*
30323 @@ -6847,7 +6909,7 @@ void zone_pcp_reset(struct zone *zone)
30324         struct per_cpu_pageset *pset;
30326         /* avoid races with drain_pages()  */
30327 -       local_irq_save(flags);
30328 +       local_lock_irqsave(pa_lock, flags);
30329         if (zone->pageset != &boot_pageset) {
30330                 for_each_online_cpu(cpu) {
30331                         pset = per_cpu_ptr(zone->pageset, cpu);
30332 @@ -6856,7 +6918,7 @@ void zone_pcp_reset(struct zone *zone)
30333                 free_percpu(zone->pageset);
30334                 zone->pageset = &boot_pageset;
30335         }
30336 -       local_irq_restore(flags);
30337 +       local_unlock_irqrestore(pa_lock, flags);
30340  #ifdef CONFIG_MEMORY_HOTREMOVE
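The page-allocator conversion above follows one pattern throughout: pages are detached from the per-CPU lists while pa_lock is held (isolate_pcp_pages()), and the actual freeing, which needs zone->lock, happens afterwards in free_pcppages_bulk(), which now disables interrupts around zone->lock itself. A condensed sketch of that shape as it would look inside mm/page_alloc.c, not part of the patch and mirroring drain_zone_pages() above:

    /* Illustrative only; uses the pa_lock / isolate_pcp_pages() /
     * free_pcppages_bulk() names introduced by this patch. */
    static void example_drain(struct zone *zone, struct per_cpu_pages *pcp,
                              int batch)
    {
            unsigned long flags;
            LIST_HEAD(dst);

            /* Phase 1: under the per-CPU lock, only unlink the pages. */
            local_lock_irqsave(pa_lock, flags);
            isolate_pcp_pages(batch, pcp, &dst);
            pcp->count -= batch;
            local_unlock_irqrestore(pa_lock, flags);

            /* Phase 2: the bulk free takes zone->lock (IRQs off) on its own,
             * so neither lock is held across the other's work and both hold
             * times stay short. */
            free_pcppages_bulk(zone, batch, &dst);
    }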
30341 diff --git a/mm/percpu.c b/mm/percpu.c
30342 index ef6353f0adbd..33ccbac7cdb8 100644
30343 --- a/mm/percpu.c
30344 +++ b/mm/percpu.c
30345 @@ -1285,18 +1285,7 @@ void free_percpu(void __percpu *ptr)
30347  EXPORT_SYMBOL_GPL(free_percpu);
30349 -/**
30350 - * is_kernel_percpu_address - test whether address is from static percpu area
30351 - * @addr: address to test
30352 - *
30353 - * Test whether @addr belongs to in-kernel static percpu area.  Module
30354 - * static percpu areas are not considered.  For those, use
30355 - * is_module_percpu_address().
30356 - *
30357 - * RETURNS:
30358 - * %true if @addr is from in-kernel static percpu area, %false otherwise.
30359 - */
30360 -bool is_kernel_percpu_address(unsigned long addr)
30361 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
30363  #ifdef CONFIG_SMP
30364         const size_t static_size = __per_cpu_end - __per_cpu_start;
30365 @@ -1305,16 +1294,36 @@ bool is_kernel_percpu_address(unsigned long addr)
30367         for_each_possible_cpu(cpu) {
30368                 void *start = per_cpu_ptr(base, cpu);
30369 +               void *va = (void *)addr;
30371 -               if ((void *)addr >= start && (void *)addr < start + static_size)
30372 +               if (va >= start && va < start + static_size) {
30373 +                       if (can_addr)
30374 +                               *can_addr = (unsigned long) (va - start);
30375                         return true;
30376 -        }
30377 +               }
30378 +       }
30379  #endif
30380         /* on UP, can't distinguish from other static vars, always false */
30381         return false;
30384  /**
30385 + * is_kernel_percpu_address - test whether address is from static percpu area
30386 + * @addr: address to test
30387 + *
30388 + * Test whether @addr belongs to in-kernel static percpu area.  Module
30389 + * static percpu areas are not considered.  For those, use
30390 + * is_module_percpu_address().
30391 + *
30392 + * RETURNS:
30393 + * %true if @addr is from in-kernel static percpu area, %false otherwise.
30394 + */
30395 +bool is_kernel_percpu_address(unsigned long addr)
30397 +       return __is_kernel_percpu_address(addr, NULL);
30400 +/**
30401   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
30402   * @addr: the address to be converted to physical address
30403   *
30404 diff --git a/mm/slab.h b/mm/slab.h
30405 index 7b6087197997..afdc57941179 100644
30406 --- a/mm/slab.h
30407 +++ b/mm/slab.h
30408 @@ -324,7 +324,11 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
30409   * The slab lists for all objects.
30410   */
30411  struct kmem_cache_node {
30412 +#ifdef CONFIG_SLUB
30413 +       raw_spinlock_t list_lock;
30414 +#else
30415         spinlock_t list_lock;
30416 +#endif
30418  #ifdef CONFIG_SLAB
30419         struct list_head slabs_partial; /* partial list first, better asm code */
30420 diff --git a/mm/slub.c b/mm/slub.c
30421 index 65d5f92d51d2..feb4a445a546 100644
30422 --- a/mm/slub.c
30423 +++ b/mm/slub.c
30424 @@ -1075,7 +1075,7 @@ static noinline struct kmem_cache_node *free_debug_processing(
30425         void *object = head;
30426         int cnt = 0;
30428 -       spin_lock_irqsave(&n->list_lock, *flags);
30429 +       raw_spin_lock_irqsave(&n->list_lock, *flags);
30430         slab_lock(page);
30432         if (!check_slab(s, page))
30433 @@ -1136,7 +1136,7 @@ out:
30435  fail:
30436         slab_unlock(page);
30437 -       spin_unlock_irqrestore(&n->list_lock, *flags);
30438 +       raw_spin_unlock_irqrestore(&n->list_lock, *flags);
30439         slab_fix(s, "Object at 0x%p not freed", object);
30440         return NULL;
30442 @@ -1263,6 +1263,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
30444  #endif /* CONFIG_SLUB_DEBUG */
30446 +struct slub_free_list {
30447 +       raw_spinlock_t          lock;
30448 +       struct list_head        list;
30450 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
30452  /*
30453   * Hooks for other subsystems that check memory allocations. In a typical
30454   * production configuration these hooks all should produce no code at all.
30455 @@ -1399,10 +1405,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
30456         gfp_t alloc_gfp;
30457         void *start, *p;
30458         int idx, order;
30459 +       bool enableirqs = false;
30461         flags &= gfp_allowed_mask;
30463         if (gfpflags_allow_blocking(flags))
30464 +               enableirqs = true;
30465 +#ifdef CONFIG_PREEMPT_RT_FULL
30466 +       if (system_state == SYSTEM_RUNNING)
30467 +               enableirqs = true;
30468 +#endif
30469 +       if (enableirqs)
30470                 local_irq_enable();
30472         flags |= s->allocflags;
30473 @@ -1473,7 +1486,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
30474         page->frozen = 1;
30476  out:
30477 -       if (gfpflags_allow_blocking(flags))
30478 +       if (enableirqs)
30479                 local_irq_disable();
30480         if (!page)
30481                 return NULL;
30482 @@ -1529,6 +1542,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
30483         __free_kmem_pages(page, order);
30486 +static void free_delayed(struct list_head *h)
30488 +       while(!list_empty(h)) {
30489 +               struct page *page = list_first_entry(h, struct page, lru);
30491 +               list_del(&page->lru);
30492 +               __free_slab(page->slab_cache, page);
30493 +       }
30496  #define need_reserve_slab_rcu                                          \
30497         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
30499 @@ -1560,6 +1583,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
30500                 }
30502                 call_rcu(head, rcu_free_slab);
30503 +       } else if (irqs_disabled()) {
30504 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
30506 +               raw_spin_lock(&f->lock);
30507 +               list_add(&page->lru, &f->list);
30508 +               raw_spin_unlock(&f->lock);
30509         } else
30510                 __free_slab(s, page);
30512 @@ -1673,7 +1702,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
30513         if (!n || !n->nr_partial)
30514                 return NULL;
30516 -       spin_lock(&n->list_lock);
30517 +       raw_spin_lock(&n->list_lock);
30518         list_for_each_entry_safe(page, page2, &n->partial, lru) {
30519                 void *t;
30521 @@ -1698,7 +1727,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
30522                         break;
30524         }
30525 -       spin_unlock(&n->list_lock);
30526 +       raw_spin_unlock(&n->list_lock);
30527         return object;
30530 @@ -1944,7 +1973,7 @@ redo:
30531                          * that acquire_slab() will see a slab page that
30532                          * is frozen
30533                          */
30534 -                       spin_lock(&n->list_lock);
30535 +                       raw_spin_lock(&n->list_lock);
30536                 }
30537         } else {
30538                 m = M_FULL;
30539 @@ -1955,7 +1984,7 @@ redo:
30540                          * slabs from diagnostic functions will not see
30541                          * any frozen slabs.
30542                          */
30543 -                       spin_lock(&n->list_lock);
30544 +                       raw_spin_lock(&n->list_lock);
30545                 }
30546         }
30548 @@ -1990,7 +2019,7 @@ redo:
30549                 goto redo;
30551         if (lock)
30552 -               spin_unlock(&n->list_lock);
30553 +               raw_spin_unlock(&n->list_lock);
30555         if (m == M_FREE) {
30556                 stat(s, DEACTIVATE_EMPTY);
30557 @@ -2022,10 +2051,10 @@ static void unfreeze_partials(struct kmem_cache *s,
30558                 n2 = get_node(s, page_to_nid(page));
30559                 if (n != n2) {
30560                         if (n)
30561 -                               spin_unlock(&n->list_lock);
30562 +                               raw_spin_unlock(&n->list_lock);
30564                         n = n2;
30565 -                       spin_lock(&n->list_lock);
30566 +                       raw_spin_lock(&n->list_lock);
30567                 }
30569                 do {
30570 @@ -2054,7 +2083,7 @@ static void unfreeze_partials(struct kmem_cache *s,
30571         }
30573         if (n)
30574 -               spin_unlock(&n->list_lock);
30575 +               raw_spin_unlock(&n->list_lock);
30577         while (discard_page) {
30578                 page = discard_page;
30579 @@ -2093,14 +2122,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
30580                         pobjects = oldpage->pobjects;
30581                         pages = oldpage->pages;
30582                         if (drain && pobjects > s->cpu_partial) {
30583 +                               struct slub_free_list *f;
30584                                 unsigned long flags;
30585 +                               LIST_HEAD(tofree);
30586                                 /*
30587                                  * partial array is full. Move the existing
30588                                  * set to the per node partial list.
30589                                  */
30590                                 local_irq_save(flags);
30591                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
30592 +                               f = this_cpu_ptr(&slub_free_list);
30593 +                               raw_spin_lock(&f->lock);
30594 +                               list_splice_init(&f->list, &tofree);
30595 +                               raw_spin_unlock(&f->lock);
30596                                 local_irq_restore(flags);
30597 +                               free_delayed(&tofree);
30598                                 oldpage = NULL;
30599                                 pobjects = 0;
30600                                 pages = 0;
30601 @@ -2172,7 +2208,22 @@ static bool has_cpu_slab(int cpu, void *info)
30603  static void flush_all(struct kmem_cache *s)
30605 +       LIST_HEAD(tofree);
30606 +       int cpu;
30608         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
30609 +       for_each_online_cpu(cpu) {
30610 +               struct slub_free_list *f;
30612 +               if (!has_cpu_slab(cpu, s))
30613 +                       continue;
30615 +               f = &per_cpu(slub_free_list, cpu);
30616 +               raw_spin_lock_irq(&f->lock);
30617 +               list_splice_init(&f->list, &tofree);
30618 +               raw_spin_unlock_irq(&f->lock);
30619 +               free_delayed(&tofree);
30620 +       }
30623  /*
30624 @@ -2208,10 +2259,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
30625         unsigned long x = 0;
30626         struct page *page;
30628 -       spin_lock_irqsave(&n->list_lock, flags);
30629 +       raw_spin_lock_irqsave(&n->list_lock, flags);
30630         list_for_each_entry(page, &n->partial, lru)
30631                 x += get_count(page);
30632 -       spin_unlock_irqrestore(&n->list_lock, flags);
30633 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30634         return x;
30636  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
30637 @@ -2349,8 +2400,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
30638   * already disabled (which is the case for bulk allocation).
30639   */
30640  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
30641 -                         unsigned long addr, struct kmem_cache_cpu *c)
30642 +                         unsigned long addr, struct kmem_cache_cpu *c,
30643 +                         struct list_head *to_free)
30645 +       struct slub_free_list *f;
30646         void *freelist;
30647         struct page *page;
30649 @@ -2410,6 +2463,13 @@ load_freelist:
30650         VM_BUG_ON(!c->page->frozen);
30651         c->freelist = get_freepointer(s, freelist);
30652         c->tid = next_tid(c->tid);
30654 +out:
30655 +       f = this_cpu_ptr(&slub_free_list);
30656 +       raw_spin_lock(&f->lock);
30657 +       list_splice_init(&f->list, to_free);
30658 +       raw_spin_unlock(&f->lock);
30660         return freelist;
30662  new_slab:
30663 @@ -2441,7 +2501,7 @@ new_slab:
30664         deactivate_slab(s, page, get_freepointer(s, freelist));
30665         c->page = NULL;
30666         c->freelist = NULL;
30667 -       return freelist;
30668 +       goto out;
30671  /*
30672 @@ -2453,6 +2513,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
30674         void *p;
30675         unsigned long flags;
30676 +       LIST_HEAD(tofree);
30678         local_irq_save(flags);
30679  #ifdef CONFIG_PREEMPT
30680 @@ -2464,8 +2525,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
30681         c = this_cpu_ptr(s->cpu_slab);
30682  #endif
30684 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
30685 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
30686         local_irq_restore(flags);
30687 +       free_delayed(&tofree);
30688         return p;
30691 @@ -2652,7 +2714,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
30693         do {
30694                 if (unlikely(n)) {
30695 -                       spin_unlock_irqrestore(&n->list_lock, flags);
30696 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30697                         n = NULL;
30698                 }
30699                 prior = page->freelist;
30700 @@ -2684,7 +2746,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
30701                                  * Otherwise the list_lock will synchronize with
30702                                  * other processors updating the list of slabs.
30703                                  */
30704 -                               spin_lock_irqsave(&n->list_lock, flags);
30705 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
30707                         }
30708                 }
30709 @@ -2726,7 +2788,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
30710                 add_partial(n, page, DEACTIVATE_TO_TAIL);
30711                 stat(s, FREE_ADD_PARTIAL);
30712         }
30713 -       spin_unlock_irqrestore(&n->list_lock, flags);
30714 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30715         return;
30717  slab_empty:
30718 @@ -2741,7 +2803,7 @@ slab_empty:
30719                 remove_full(s, n, page);
30720         }
30722 -       spin_unlock_irqrestore(&n->list_lock, flags);
30723 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30724         stat(s, FREE_SLAB);
30725         discard_slab(s, page);
30727 @@ -2913,6 +2975,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
30728                           void **p)
30730         struct kmem_cache_cpu *c;
30731 +       LIST_HEAD(to_free);
30732         int i;
30734         /* memcg and kmem_cache debug support */
30735 @@ -2936,7 +2999,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
30736                          * of re-populating per CPU c->freelist
30737                          */
30738                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
30739 -                                           _RET_IP_, c);
30740 +                                           _RET_IP_, c, &to_free);
30741                         if (unlikely(!p[i]))
30742                                 goto error;
30744 @@ -2948,6 +3011,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
30745         }
30746         c->tid = next_tid(c->tid);
30747         local_irq_enable();
30748 +       free_delayed(&to_free);
30750         /* Clear memory outside IRQ disabled fastpath loop */
30751         if (unlikely(flags & __GFP_ZERO)) {
30752 @@ -3095,7 +3159,7 @@ static void
30753  init_kmem_cache_node(struct kmem_cache_node *n)
30755         n->nr_partial = 0;
30756 -       spin_lock_init(&n->list_lock);
30757 +       raw_spin_lock_init(&n->list_lock);
30758         INIT_LIST_HEAD(&n->partial);
30759  #ifdef CONFIG_SLUB_DEBUG
30760         atomic_long_set(&n->nr_slabs, 0);
30761 @@ -3677,7 +3741,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
30762                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
30763                         INIT_LIST_HEAD(promote + i);
30765 -               spin_lock_irqsave(&n->list_lock, flags);
30766 +               raw_spin_lock_irqsave(&n->list_lock, flags);
30768                 /*
30769                  * Build lists of slabs to discard or promote.
30770 @@ -3708,7 +3772,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
30771                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
30772                         list_splice(promote + i, &n->partial);
30774 -               spin_unlock_irqrestore(&n->list_lock, flags);
30775 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
30777                 /* Release empty slabs */
30778                 list_for_each_entry_safe(page, t, &discard, lru)
30779 @@ -3884,6 +3948,12 @@ void __init kmem_cache_init(void)
30781         static __initdata struct kmem_cache boot_kmem_cache,
30782                 boot_kmem_cache_node;
30783 +       int cpu;
30785 +       for_each_possible_cpu(cpu) {
30786 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
30787 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
30788 +       }
30790         if (debug_guardpage_minorder())
30791                 slub_max_order = 0;
30792 @@ -4127,7 +4197,7 @@ static int validate_slab_node(struct kmem_cache *s,
30793         struct page *page;
30794         unsigned long flags;
30796 -       spin_lock_irqsave(&n->list_lock, flags);
30797 +       raw_spin_lock_irqsave(&n->list_lock, flags);
30799         list_for_each_entry(page, &n->partial, lru) {
30800                 validate_slab_slab(s, page, map);
30801 @@ -4149,7 +4219,7 @@ static int validate_slab_node(struct kmem_cache *s,
30802                        s->name, count, atomic_long_read(&n->nr_slabs));
30804  out:
30805 -       spin_unlock_irqrestore(&n->list_lock, flags);
30806 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
30807         return count;
30810 @@ -4337,12 +4407,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
30811                 if (!atomic_long_read(&n->nr_slabs))
30812                         continue;
30814 -               spin_lock_irqsave(&n->list_lock, flags);
30815 +               raw_spin_lock_irqsave(&n->list_lock, flags);
30816                 list_for_each_entry(page, &n->partial, lru)
30817                         process_slab(&t, s, page, alloc, map);
30818                 list_for_each_entry(page, &n->full, lru)
30819                         process_slab(&t, s, page, alloc, map);
30820 -               spin_unlock_irqrestore(&n->list_lock, flags);
30821 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
30822         }
30824         for (i = 0; i < t.count; i++) {
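
The slab.h/slub.c changes above have two parts: kmem_cache_node->list_lock becomes a raw_spinlock_t (raw locks keep spinning on PREEMPT_RT, so the slow paths that take it with interrupts off remain valid), and slab pages that would otherwise be freed with IRQs disabled are parked on a per-CPU slub_free_list and released later by free_delayed() once IRQs are back on. A rough sketch of that deferred-free idea (example_* names are hypothetical; __free_page() stands in for __free_slab()):

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct example_deferred {
	raw_spinlock_t   lock;		/* raw: still a spinning lock on RT */
	struct list_head list;
};
static DEFINE_PER_CPU(struct example_deferred, example_deferred);

static void __init example_deferred_init(void)
{
	int cpu;

	/* mirrors the kmem_cache_init() loop added above */
	for_each_possible_cpu(cpu) {
		raw_spin_lock_init(&per_cpu(example_deferred, cpu).lock);
		INIT_LIST_HEAD(&per_cpu(example_deferred, cpu).list);
	}
}

/* Called with IRQs off: only queue the page, never free it here. */
static void example_defer_free(struct page *page)
{
	struct example_deferred *d = this_cpu_ptr(&example_deferred);

	raw_spin_lock(&d->lock);
	list_add(&page->lru, &d->list);
	raw_spin_unlock(&d->lock);
}

static void example_alloc_path(void)
{
	struct example_deferred *d;
	LIST_HEAD(tofree);
	unsigned long flags;
	struct page *page;

	local_irq_save(flags);
	/* ... slow-path work that may call example_defer_free() ... */
	d = this_cpu_ptr(&example_deferred);
	raw_spin_lock(&d->lock);
	list_splice_init(&d->list, &tofree);
	raw_spin_unlock(&d->lock);
	local_irq_restore(flags);

	/* IRQs are enabled again: now it is safe to really free the pages. */
	while (!list_empty(&tofree)) {
		page = list_first_entry(&tofree, struct page, lru);
		list_del(&page->lru);
		__free_page(page);	/* stand-in for __free_slab() */
	}
}
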
30825 diff --git a/mm/swap.c b/mm/swap.c
30826 index 39395fb549c0..ad16649221d7 100644
30827 --- a/mm/swap.c
30828 +++ b/mm/swap.c
30829 @@ -31,6 +31,7 @@
30830  #include <linux/memcontrol.h>
30831  #include <linux/gfp.h>
30832  #include <linux/uio.h>
30833 +#include <linux/locallock.h>
30834  #include <linux/hugetlb.h>
30835  #include <linux/page_idle.h>
30837 @@ -46,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
30838  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
30839  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
30841 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
30842 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
30844  /*
30845   * This path almost never happens for VM activity - pages are normally
30846   * freed via pagevecs.  But it gets used by networking.
30847 @@ -481,11 +485,11 @@ void rotate_reclaimable_page(struct page *page)
30848                 unsigned long flags;
30850                 page_cache_get(page);
30851 -               local_irq_save(flags);
30852 +               local_lock_irqsave(rotate_lock, flags);
30853                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
30854                 if (!pagevec_add(pvec, page))
30855                         pagevec_move_tail(pvec);
30856 -               local_irq_restore(flags);
30857 +               local_unlock_irqrestore(rotate_lock, flags);
30858         }
30861 @@ -536,12 +540,13 @@ static bool need_activate_page_drain(int cpu)
30862  void activate_page(struct page *page)
30864         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
30865 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
30866 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30867 +                                                      activate_page_pvecs);
30869                 page_cache_get(page);
30870                 if (!pagevec_add(pvec, page))
30871                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
30872 -               put_cpu_var(activate_page_pvecs);
30873 +               put_locked_var(swapvec_lock, activate_page_pvecs);
30874         }
30877 @@ -567,7 +572,7 @@ void activate_page(struct page *page)
30879  static void __lru_cache_activate_page(struct page *page)
30881 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30882 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30883         int i;
30885         /*
30886 @@ -589,7 +594,7 @@ static void __lru_cache_activate_page(struct page *page)
30887                 }
30888         }
30890 -       put_cpu_var(lru_add_pvec);
30891 +       put_locked_var(swapvec_lock, lru_add_pvec);
30894  /*
30895 @@ -630,13 +635,13 @@ EXPORT_SYMBOL(mark_page_accessed);
30897  static void __lru_cache_add(struct page *page)
30899 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
30900 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
30902         page_cache_get(page);
30903         if (!pagevec_space(pvec))
30904                 __pagevec_lru_add(pvec);
30905         pagevec_add(pvec, page);
30906 -       put_cpu_var(lru_add_pvec);
30907 +       put_locked_var(swapvec_lock, lru_add_pvec);
30910  /**
30911 @@ -816,9 +821,15 @@ void lru_add_drain_cpu(int cpu)
30912                 unsigned long flags;
30914                 /* No harm done if a racing interrupt already did this */
30915 -               local_irq_save(flags);
30916 +#ifdef CONFIG_PREEMPT_RT_BASE
30917 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
30918 +               pagevec_move_tail(pvec);
30919 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
30920 +#else
30921 +               local_lock_irqsave(rotate_lock, flags);
30922                 pagevec_move_tail(pvec);
30923 -               local_irq_restore(flags);
30924 +               local_unlock_irqrestore(rotate_lock, flags);
30925 +#endif
30926         }
30928         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
30929 @@ -846,26 +857,47 @@ void deactivate_file_page(struct page *page)
30930                 return;
30932         if (likely(get_page_unless_zero(page))) {
30933 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
30934 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
30935 +                                                      lru_deactivate_file_pvecs);
30937                 if (!pagevec_add(pvec, page))
30938                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
30939 -               put_cpu_var(lru_deactivate_file_pvecs);
30940 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
30941         }
30944  void lru_add_drain(void)
30946 -       lru_add_drain_cpu(get_cpu());
30947 -       put_cpu();
30948 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
30949 +       local_unlock_cpu(swapvec_lock);
30953 +#ifdef CONFIG_PREEMPT_RT_BASE
30954 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30956 +       local_lock_on(swapvec_lock, cpu);
30957 +       lru_add_drain_cpu(cpu);
30958 +       local_unlock_on(swapvec_lock, cpu);
30961 +#else
30963  static void lru_add_drain_per_cpu(struct work_struct *dummy)
30965         lru_add_drain();
30968  static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
30969 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
30971 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30973 +       INIT_WORK(work, lru_add_drain_per_cpu);
30974 +       schedule_work_on(cpu, work);
30975 +       cpumask_set_cpu(cpu, has_work);
30977 +#endif
30979  void lru_add_drain_all(void)
30981 @@ -878,20 +910,17 @@ void lru_add_drain_all(void)
30982         cpumask_clear(&has_work);
30984         for_each_online_cpu(cpu) {
30985 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
30987                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
30988                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
30989                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
30990 -                   need_activate_page_drain(cpu)) {
30991 -                       INIT_WORK(work, lru_add_drain_per_cpu);
30992 -                       schedule_work_on(cpu, work);
30993 -                       cpumask_set_cpu(cpu, &has_work);
30994 -               }
30995 +                   need_activate_page_drain(cpu))
30996 +                       remote_lru_add_drain(cpu, &has_work);
30997         }
30999 +#ifndef CONFIG_PREEMPT_RT_BASE
31000         for_each_cpu(cpu, &has_work)
31001                 flush_work(&per_cpu(lru_add_drain_work, cpu));
31002 +#endif
31004         put_online_cpus();
31005         mutex_unlock(&lock);
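
The swap.c conversions above all use the same get_locked_var()/put_locked_var() idiom from locallock.h: on !PREEMPT_RT it effectively degrades to get_cpu_var()/put_cpu_var(), on PREEMPT_RT it takes the named local lock (here swapvec_lock) rather than disabling preemption. A condensed sketch of the assembled result (example_* names are hypothetical):

#include <linux/locallock.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);
static DEFINE_PER_CPU(struct pagevec, example_pvec);

static void example_lru_cache_add(struct page *page)
{
	/* Pins the per-CPU pagevec: preempt-off on !RT, local lock on RT. */
	struct pagevec *pvec = &get_locked_var(example_lock, example_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_locked_var(example_lock, example_pvec);
}
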
31006 diff --git a/mm/truncate.c b/mm/truncate.c
31007 index 76e35ad97102..5f196420020c 100644
31008 --- a/mm/truncate.c
31009 +++ b/mm/truncate.c
31010 @@ -56,8 +56,11 @@ static void clear_exceptional_entry(struct address_space *mapping,
31011          * protected by mapping->tree_lock.
31012          */
31013         if (!workingset_node_shadows(node) &&
31014 -           !list_empty(&node->private_list))
31015 -               list_lru_del(&workingset_shadow_nodes, &node->private_list);
31016 +           !list_empty(&node->private_list)) {
31017 +               local_lock(workingset_shadow_lock);
31018 +               list_lru_del(&__workingset_shadow_nodes, &node->private_list);
31019 +               local_unlock(workingset_shadow_lock);
31020 +       }
31021         __radix_tree_delete_node(&mapping->page_tree, node);
31022  unlock:
31023         spin_unlock_irq(&mapping->tree_lock);
31024 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
31025 index 8e3c9c5a3042..68740314ad54 100644
31026 --- a/mm/vmalloc.c
31027 +++ b/mm/vmalloc.c
31028 @@ -821,7 +821,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
31029         struct vmap_block *vb;
31030         struct vmap_area *va;
31031         unsigned long vb_idx;
31032 -       int node, err;
31033 +       int node, err, cpu;
31034         void *vaddr;
31036         node = numa_node_id();
31037 @@ -864,11 +864,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
31038         BUG_ON(err);
31039         radix_tree_preload_end();
31041 -       vbq = &get_cpu_var(vmap_block_queue);
31042 +       cpu = get_cpu_light();
31043 +       vbq = this_cpu_ptr(&vmap_block_queue);
31044         spin_lock(&vbq->lock);
31045         list_add_tail_rcu(&vb->free_list, &vbq->free);
31046         spin_unlock(&vbq->lock);
31047 -       put_cpu_var(vmap_block_queue);
31048 +       put_cpu_light();
31050         return vaddr;
31052 @@ -937,6 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31053         struct vmap_block *vb;
31054         void *vaddr = NULL;
31055         unsigned int order;
31056 +       int cpu;
31058         BUG_ON(offset_in_page(size));
31059         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
31060 @@ -951,7 +953,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31061         order = get_order(size);
31063         rcu_read_lock();
31064 -       vbq = &get_cpu_var(vmap_block_queue);
31065 +       cpu = get_cpu_light();
31066 +       vbq = this_cpu_ptr(&vmap_block_queue);
31067         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
31068                 unsigned long pages_off;
31070 @@ -974,7 +977,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
31071                 break;
31072         }
31074 -       put_cpu_var(vmap_block_queue);
31075 +       put_cpu_light();
31076         rcu_read_unlock();
31078         /* Allocate new block if nothing was found */
31079 diff --git a/mm/vmstat.c b/mm/vmstat.c
31080 index c54fd2924f25..64416fd7c209 100644
31081 --- a/mm/vmstat.c
31082 +++ b/mm/vmstat.c
31083 @@ -226,6 +226,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
31084         long x;
31085         long t;
31087 +       preempt_disable_rt();
31088         x = delta + __this_cpu_read(*p);
31090         t = __this_cpu_read(pcp->stat_threshold);
31091 @@ -235,6 +236,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
31092                 x = 0;
31093         }
31094         __this_cpu_write(*p, x);
31095 +       preempt_enable_rt();
31097  EXPORT_SYMBOL(__mod_zone_page_state);
31099 @@ -267,6 +269,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
31100         s8 __percpu *p = pcp->vm_stat_diff + item;
31101         s8 v, t;
31103 +       preempt_disable_rt();
31104         v = __this_cpu_inc_return(*p);
31105         t = __this_cpu_read(pcp->stat_threshold);
31106         if (unlikely(v > t)) {
31107 @@ -275,6 +278,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
31108                 zone_page_state_add(v + overstep, zone, item);
31109                 __this_cpu_write(*p, -overstep);
31110         }
31111 +       preempt_enable_rt();
31114  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
31115 @@ -289,6 +293,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
31116         s8 __percpu *p = pcp->vm_stat_diff + item;
31117         s8 v, t;
31119 +       preempt_disable_rt();
31120         v = __this_cpu_dec_return(*p);
31121         t = __this_cpu_read(pcp->stat_threshold);
31122         if (unlikely(v < - t)) {
31123 @@ -297,6 +302,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
31124                 zone_page_state_add(v - overstep, zone, item);
31125                 __this_cpu_write(*p, overstep);
31126         }
31127 +       preempt_enable_rt();
31130  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
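
The vmstat.c hunks rely on preempt_disable_rt()/preempt_enable_rt(), also added elsewhere in this patch: no-ops on !PREEMPT_RT, a real preempt_disable()/preempt_enable() pair on PREEMPT_RT, so the __this_cpu read-modify-write sequences above cannot be preempted and migrated mid-update. Minimal sketch (example_* is hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(long, example_stat);

static void example_mod_stat(long delta)
{
	/* No-op on !RT; real preempt_disable() on PREEMPT_RT. */
	preempt_disable_rt();
	__this_cpu_add(example_stat, delta);
	preempt_enable_rt();
}
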
31131 diff --git a/mm/workingset.c b/mm/workingset.c
31132 index df66f426fdcf..6db7b243fa0d 100644
31133 --- a/mm/workingset.c
31134 +++ b/mm/workingset.c
31135 @@ -264,7 +264,8 @@ void workingset_activation(struct page *page)
31136   * point where they would still be useful.
31137   */
31139 -struct list_lru workingset_shadow_nodes;
31140 +struct list_lru __workingset_shadow_nodes;
31141 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
31143  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
31144                                         struct shrink_control *sc)
31145 @@ -274,9 +275,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
31146         unsigned long pages;
31148         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
31149 -       local_irq_disable();
31150 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
31151 -       local_irq_enable();
31152 +       local_lock_irq(workingset_shadow_lock);
31153 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
31154 +       local_unlock_irq(workingset_shadow_lock);
31156         pages = node_present_pages(sc->nid);
31157         /*
31158 @@ -361,9 +362,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
31159         spin_unlock(&mapping->tree_lock);
31160         ret = LRU_REMOVED_RETRY;
31161  out:
31162 -       local_irq_enable();
31163 +       local_unlock_irq(workingset_shadow_lock);
31164         cond_resched();
31165 -       local_irq_disable();
31166 +       local_lock_irq(workingset_shadow_lock);
31167         spin_lock(lru_lock);
31168         return ret;
31170 @@ -374,10 +375,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
31171         unsigned long ret;
31173         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
31174 -       local_irq_disable();
31175 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
31176 +       local_lock_irq(workingset_shadow_lock);
31177 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
31178                                     shadow_lru_isolate, NULL);
31179 -       local_irq_enable();
31180 +       local_unlock_irq(workingset_shadow_lock);
31181         return ret;
31184 @@ -398,7 +399,7 @@ static int __init workingset_init(void)
31186         int ret;
31188 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
31189 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
31190         if (ret)
31191                 goto err;
31192         ret = register_shrinker(&workingset_shadow_shrinker);
31193 @@ -406,7 +407,7 @@ static int __init workingset_init(void)
31194                 goto err_list_lru;
31195         return 0;
31196  err_list_lru:
31197 -       list_lru_destroy(&workingset_shadow_nodes);
31198 +       list_lru_destroy(&__workingset_shadow_nodes);
31199  err:
31200         return ret;
31202 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
31203 index c1ea19478119..529552c3716d 100644
31204 --- a/mm/zsmalloc.c
31205 +++ b/mm/zsmalloc.c
31206 @@ -64,6 +64,7 @@
31207  #include <linux/debugfs.h>
31208  #include <linux/zsmalloc.h>
31209  #include <linux/zpool.h>
31210 +#include <linux/locallock.h>
31212  /*
31213   * This must be power of 2 and greater than of equal to sizeof(link_free).
31214 @@ -403,6 +404,7 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
31216  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
31217  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
31218 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
31220  static int is_first_page(struct page *page)
31222 @@ -1289,7 +1291,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
31223         class = pool->size_class[class_idx];
31224         off = obj_idx_to_offset(page, obj_idx, class->size);
31226 -       area = &get_cpu_var(zs_map_area);
31227 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
31228         area->vm_mm = mm;
31229         if (off + class->size <= PAGE_SIZE) {
31230                 /* this object is contained entirely within a page */
31231 @@ -1342,7 +1344,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
31233                 __zs_unmap_object(area, pages, off, class->size);
31234         }
31235 -       put_cpu_var(zs_map_area);
31236 +       put_locked_var(zs_map_area_lock, zs_map_area);
31237         unpin_tag(handle);
31239  EXPORT_SYMBOL_GPL(zs_unmap_object);
31240 diff --git a/net/core/dev.c b/net/core/dev.c
31241 index 48399d8ce614..2ccc02633e97 100644
31242 --- a/net/core/dev.c
31243 +++ b/net/core/dev.c
31244 @@ -186,6 +186,7 @@ static unsigned int napi_gen_id;
31245  static DEFINE_HASHTABLE(napi_hash, 8);
31247  static seqcount_t devnet_rename_seq;
31248 +static DEFINE_MUTEX(devnet_rename_mutex);
31250  static inline void dev_base_seq_inc(struct net *net)
31252 @@ -207,14 +208,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
31253  static inline void rps_lock(struct softnet_data *sd)
31255  #ifdef CONFIG_RPS
31256 -       spin_lock(&sd->input_pkt_queue.lock);
31257 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
31258  #endif
31261  static inline void rps_unlock(struct softnet_data *sd)
31263  #ifdef CONFIG_RPS
31264 -       spin_unlock(&sd->input_pkt_queue.lock);
31265 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
31266  #endif
31269 @@ -884,7 +885,8 @@ retry:
31270         strcpy(name, dev->name);
31271         rcu_read_unlock();
31272         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
31273 -               cond_resched();
31274 +               mutex_lock(&devnet_rename_mutex);
31275 +               mutex_unlock(&devnet_rename_mutex);
31276                 goto retry;
31277         }
31279 @@ -1153,20 +1155,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
31280         if (dev->flags & IFF_UP)
31281                 return -EBUSY;
31283 -       write_seqcount_begin(&devnet_rename_seq);
31284 +       mutex_lock(&devnet_rename_mutex);
31285 +       __raw_write_seqcount_begin(&devnet_rename_seq);
31287 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
31288 -               write_seqcount_end(&devnet_rename_seq);
31289 -               return 0;
31290 -       }
31291 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
31292 +               goto outunlock;
31294         memcpy(oldname, dev->name, IFNAMSIZ);
31296         err = dev_get_valid_name(net, dev, newname);
31297 -       if (err < 0) {
31298 -               write_seqcount_end(&devnet_rename_seq);
31299 -               return err;
31300 -       }
31301 +       if (err < 0)
31302 +               goto outunlock;
31304         if (oldname[0] && !strchr(oldname, '%'))
31305                 netdev_info(dev, "renamed from %s\n", oldname);
31306 @@ -1179,11 +1178,12 @@ rollback:
31307         if (ret) {
31308                 memcpy(dev->name, oldname, IFNAMSIZ);
31309                 dev->name_assign_type = old_assign_type;
31310 -               write_seqcount_end(&devnet_rename_seq);
31311 -               return ret;
31312 +               err = ret;
31313 +               goto outunlock;
31314         }
31316 -       write_seqcount_end(&devnet_rename_seq);
31317 +       __raw_write_seqcount_end(&devnet_rename_seq);
31318 +       mutex_unlock(&devnet_rename_mutex);
31320         netdev_adjacent_rename_links(dev, oldname);
31322 @@ -1204,7 +1204,8 @@ rollback:
31323                 /* err >= 0 after dev_alloc_name() or stores the first errno */
31324                 if (err >= 0) {
31325                         err = ret;
31326 -                       write_seqcount_begin(&devnet_rename_seq);
31327 +                       mutex_lock(&devnet_rename_mutex);
31328 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
31329                         memcpy(dev->name, oldname, IFNAMSIZ);
31330                         memcpy(oldname, newname, IFNAMSIZ);
31331                         dev->name_assign_type = old_assign_type;
31332 @@ -1217,6 +1218,11 @@ rollback:
31333         }
31335         return err;
31337 +outunlock:
31338 +       __raw_write_seqcount_end(&devnet_rename_seq);
31339 +       mutex_unlock(&devnet_rename_mutex);
31340 +       return err;
31343  /**
31344 @@ -2268,6 +2274,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
31345         sd->output_queue_tailp = &q->next_sched;
31346         raise_softirq_irqoff(NET_TX_SOFTIRQ);
31347         local_irq_restore(flags);
31348 +       preempt_check_resched_rt();
31351  void __netif_schedule(struct Qdisc *q)
31352 @@ -2349,6 +2356,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
31353         __this_cpu_write(softnet_data.completion_queue, skb);
31354         raise_softirq_irqoff(NET_TX_SOFTIRQ);
31355         local_irq_restore(flags);
31356 +       preempt_check_resched_rt();
31358  EXPORT_SYMBOL(__dev_kfree_skb_irq);
31360 @@ -2906,7 +2914,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
31361          * This permits __QDISC___STATE_RUNNING owner to get the lock more
31362          * often and dequeue packets faster.
31363          */
31364 +#ifdef CONFIG_PREEMPT_RT_FULL
31365 +       contended = true;
31366 +#else
31367         contended = qdisc_is_running(q);
31368 +#endif
31369         if (unlikely(contended))
31370                 spin_lock(&q->busylock);
31372 @@ -2966,9 +2978,44 @@ static void skb_update_prio(struct sk_buff *skb)
31373  #define skb_update_prio(skb)
31374  #endif
31376 +#ifdef CONFIG_PREEMPT_RT_FULL
31378 +static inline int xmit_rec_read(void)
31380 +       return current->xmit_recursion;
31383 +static inline void xmit_rec_inc(void)
31385 +       current->xmit_recursion++;
31388 +static inline void xmit_rec_dec(void)
31390 +       current->xmit_recursion--;
31393 +#else
31395  DEFINE_PER_CPU(int, xmit_recursion);
31396  EXPORT_SYMBOL(xmit_recursion);
31398 +static inline int xmit_rec_read(void)
31400 +       return __this_cpu_read(xmit_recursion);
31403 +static inline void xmit_rec_inc(void)
31405 +       __this_cpu_inc(xmit_recursion);
31408 +static inline void xmit_rec_dec(void)
31410 +       __this_cpu_dec(xmit_recursion);
31412 +#endif
31414  #define RECURSION_LIMIT 10
31416  /**
31417 @@ -3161,7 +3208,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
31419                 if (txq->xmit_lock_owner != cpu) {
31421 -                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
31422 +                       if (xmit_rec_read() > RECURSION_LIMIT)
31423                                 goto recursion_alert;
31425                         skb = validate_xmit_skb(skb, dev);
31426 @@ -3171,9 +3218,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
31427                         HARD_TX_LOCK(dev, txq, cpu);
31429                         if (!netif_xmit_stopped(txq)) {
31430 -                               __this_cpu_inc(xmit_recursion);
31431 +                               xmit_rec_inc();
31432                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
31433 -                               __this_cpu_dec(xmit_recursion);
31434 +                               xmit_rec_dec();
31435                                 if (dev_xmit_complete(rc)) {
31436                                         HARD_TX_UNLOCK(dev, txq);
31437                                         goto out;
31438 @@ -3547,6 +3594,7 @@ drop:
31439         rps_unlock(sd);
31441         local_irq_restore(flags);
31442 +       preempt_check_resched_rt();
31444         atomic_long_inc(&skb->dev->rx_dropped);
31445         kfree_skb(skb);
31446 @@ -3565,7 +3613,7 @@ static int netif_rx_internal(struct sk_buff *skb)
31447                 struct rps_dev_flow voidflow, *rflow = &voidflow;
31448                 int cpu;
31450 -               preempt_disable();
31451 +               migrate_disable();
31452                 rcu_read_lock();
31454                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
31455 @@ -3575,13 +3623,13 @@ static int netif_rx_internal(struct sk_buff *skb)
31456                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
31458                 rcu_read_unlock();
31459 -               preempt_enable();
31460 +               migrate_enable();
31461         } else
31462  #endif
31463         {
31464                 unsigned int qtail;
31465 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
31466 -               put_cpu();
31467 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
31468 +               put_cpu_light();
31469         }
31470         return ret;
31472 @@ -3615,16 +3663,44 @@ int netif_rx_ni(struct sk_buff *skb)
31474         trace_netif_rx_ni_entry(skb);
31476 -       preempt_disable();
31477 +       local_bh_disable();
31478         err = netif_rx_internal(skb);
31479 -       if (local_softirq_pending())
31480 -               do_softirq();
31481 -       preempt_enable();
31482 +       local_bh_enable();
31484         return err;
31486  EXPORT_SYMBOL(netif_rx_ni);
31488 +#ifdef CONFIG_PREEMPT_RT_FULL
31490 + * RT runs ksoftirqd as a real time thread and the root_lock is a
31491 + * "sleeping spinlock". If the trylock fails then we can go into an
31492 + * infinite loop when ksoftirqd preempted the task which actually
31493 + * holds the lock, because we requeue q and raise NET_TX softirq
31494 + * causing ksoftirqd to loop forever.
31495 + *
31496 + * It's safe to use spin_lock on RT here as softirqs run in thread
31497 + * context and cannot deadlock against the thread which is holding
31498 + * root_lock.
31499 + *
31500 + * On !RT the trylock might fail, but there we bail out from the
31501 + * softirq loop after 10 attempts which we can't do on RT. And the
31502 + * task holding root_lock cannot be preempted, so the only downside of
31503 + * that trylock is that we need 10 loops to decide that we should have
31504 + * given up in the first one :)
31505 + */
31506 +static inline int take_root_lock(spinlock_t *lock)
31508 +       spin_lock(lock);
31509 +       return 1;
31511 +#else
31512 +static inline int take_root_lock(spinlock_t *lock)
31514 +       return spin_trylock(lock);
31516 +#endif
31518  static void net_tx_action(struct softirq_action *h)
31520         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
31521 @@ -3666,7 +3742,7 @@ static void net_tx_action(struct softirq_action *h)
31522                         head = head->next_sched;
31524                         root_lock = qdisc_lock(q);
31525 -                       if (spin_trylock(root_lock)) {
31526 +                       if (take_root_lock(root_lock)) {
31527                                 smp_mb__before_atomic();
31528                                 clear_bit(__QDISC_STATE_SCHED,
31529                                           &q->state);
31530 @@ -4088,7 +4164,7 @@ static void flush_backlog(void *arg)
31531         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
31532                 if (skb->dev == dev) {
31533                         __skb_unlink(skb, &sd->input_pkt_queue);
31534 -                       kfree_skb(skb);
31535 +                       __skb_queue_tail(&sd->tofree_queue, skb);
31536                         input_queue_head_incr(sd);
31537                 }
31538         }
31539 @@ -4097,10 +4173,13 @@ static void flush_backlog(void *arg)
31540         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
31541                 if (skb->dev == dev) {
31542                         __skb_unlink(skb, &sd->process_queue);
31543 -                       kfree_skb(skb);
31544 +                       __skb_queue_tail(&sd->tofree_queue, skb);
31545                         input_queue_head_incr(sd);
31546                 }
31547         }
31549 +       if (!skb_queue_empty(&sd->tofree_queue))
31550 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
31553  static int napi_gro_complete(struct sk_buff *skb)
31554 @@ -4557,6 +4636,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
31555                 sd->rps_ipi_list = NULL;
31557                 local_irq_enable();
31558 +               preempt_check_resched_rt();
31560                 /* Send pending IPI's to kick RPS processing on remote cpus. */
31561                 while (remsd) {
31562 @@ -4570,6 +4650,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
31563         } else
31564  #endif
31565                 local_irq_enable();
31566 +       preempt_check_resched_rt();
31569  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
31570 @@ -4651,9 +4732,11 @@ void __napi_schedule(struct napi_struct *n)
31571         local_irq_save(flags);
31572         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
31573         local_irq_restore(flags);
31574 +       preempt_check_resched_rt();
31576  EXPORT_SYMBOL(__napi_schedule);
31578 +#ifndef CONFIG_PREEMPT_RT_FULL
31579  /**
31580   * __napi_schedule_irqoff - schedule for receive
31581   * @n: entry to schedule
31582 @@ -4665,6 +4748,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
31583         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
31585  EXPORT_SYMBOL(__napi_schedule_irqoff);
31586 +#endif
31588  void __napi_complete(struct napi_struct *n)
31590 @@ -4891,13 +4975,21 @@ static void net_rx_action(struct softirq_action *h)
31591         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
31592         unsigned long time_limit = jiffies + 2;
31593         int budget = netdev_budget;
31594 +       struct sk_buff_head tofree_q;
31595 +       struct sk_buff *skb;
31596         LIST_HEAD(list);
31597         LIST_HEAD(repoll);
31599 +       __skb_queue_head_init(&tofree_q);
31601         local_irq_disable();
31602 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
31603         list_splice_init(&sd->poll_list, &list);
31604         local_irq_enable();
31606 +       while ((skb = __skb_dequeue(&tofree_q)))
31607 +               kfree_skb(skb);
31609         for (;;) {
31610                 struct napi_struct *n;
31612 @@ -4927,7 +5019,7 @@ static void net_rx_action(struct softirq_action *h)
31613         list_splice_tail(&repoll, &list);
31614         list_splice(&list, &sd->poll_list);
31615         if (!list_empty(&sd->poll_list))
31616 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
31617 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
31619         net_rps_action_and_irq_enable(sd);
31621 @@ -7266,7 +7358,7 @@ EXPORT_SYMBOL(free_netdev);
31622  void synchronize_net(void)
31624         might_sleep();
31625 -       if (rtnl_is_locked())
31626 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
31627                 synchronize_rcu_expedited();
31628         else
31629                 synchronize_rcu();
31630 @@ -7507,16 +7599,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
31632         raise_softirq_irqoff(NET_TX_SOFTIRQ);
31633         local_irq_enable();
31634 +       preempt_check_resched_rt();
31636         /* Process offline CPU's input_pkt_queue */
31637         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
31638                 netif_rx_ni(skb);
31639                 input_queue_head_incr(oldsd);
31640         }
31641 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
31642 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
31643                 netif_rx_ni(skb);
31644                 input_queue_head_incr(oldsd);
31645         }
31646 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
31647 +               kfree_skb(skb);
31648 +       }
31650         return NOTIFY_OK;
31652 @@ -7818,8 +7914,9 @@ static int __init net_dev_init(void)
31653         for_each_possible_cpu(i) {
31654                 struct softnet_data *sd = &per_cpu(softnet_data, i);
31656 -               skb_queue_head_init(&sd->input_pkt_queue);
31657 -               skb_queue_head_init(&sd->process_queue);
31658 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
31659 +               skb_queue_head_init_raw(&sd->process_queue);
31660 +               skb_queue_head_init_raw(&sd->tofree_queue);
31661                 INIT_LIST_HEAD(&sd->poll_list);
31662                 sd->output_queue_tailp = &sd->output_queue;
31663  #ifdef CONFIG_RPS
31664 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
31665 index 73dfd7729bc9..62f62810cddd 100644
31666 --- a/net/core/skbuff.c
31667 +++ b/net/core/skbuff.c
31668 @@ -63,6 +63,7 @@
31669  #include <linux/errqueue.h>
31670  #include <linux/prefetch.h>
31671  #include <linux/if_vlan.h>
31672 +#include <linux/locallock.h>
31674  #include <net/protocol.h>
31675  #include <net/dst.h>
31676 @@ -351,6 +352,8 @@ EXPORT_SYMBOL(build_skb);
31678  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
31679  static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
31680 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
31681 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
31683  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
31685 @@ -358,10 +361,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
31686         unsigned long flags;
31687         void *data;
31689 -       local_irq_save(flags);
31690 +       local_lock_irqsave(netdev_alloc_lock, flags);
31691         nc = this_cpu_ptr(&netdev_alloc_cache);
31692         data = __alloc_page_frag(nc, fragsz, gfp_mask);
31693 -       local_irq_restore(flags);
31694 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
31695         return data;
31698 @@ -380,9 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
31700  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
31702 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
31703 +       struct page_frag_cache *nc;
31704 +       void *data;
31706 -       return __alloc_page_frag(nc, fragsz, gfp_mask);
31707 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
31708 +       data = __alloc_page_frag(nc, fragsz, gfp_mask);
31709 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
31710 +       return data;
31713  void *napi_alloc_frag(unsigned int fragsz)
31714 @@ -429,13 +436,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
31715         if (sk_memalloc_socks())
31716                 gfp_mask |= __GFP_MEMALLOC;
31718 -       local_irq_save(flags);
31719 +       local_lock_irqsave(netdev_alloc_lock, flags);
31721         nc = this_cpu_ptr(&netdev_alloc_cache);
31722         data = __alloc_page_frag(nc, len, gfp_mask);
31723         pfmemalloc = nc->pfmemalloc;
31725 -       local_irq_restore(flags);
31726 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
31728         if (unlikely(!data))
31729                 return NULL;
31730 @@ -476,9 +483,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
31731  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
31732                                  gfp_t gfp_mask)
31734 -       struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
31735 +       struct page_frag_cache *nc;
31736         struct sk_buff *skb;
31737         void *data;
31738 +       bool pfmemalloc;
31740         len += NET_SKB_PAD + NET_IP_ALIGN;
31742 @@ -496,7 +504,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
31743         if (sk_memalloc_socks())
31744                 gfp_mask |= __GFP_MEMALLOC;
31746 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
31747         data = __alloc_page_frag(nc, len, gfp_mask);
31748 +       pfmemalloc = nc->pfmemalloc;
31749 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
31751         if (unlikely(!data))
31752                 return NULL;
31754 @@ -507,7 +519,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
31755         }
31757         /* use OR instead of assignment to avoid clearing of bits in mask */
31758 -       if (nc->pfmemalloc)
31759 +       if (pfmemalloc)
31760                 skb->pfmemalloc = 1;
31761         skb->head_frag = 1;
31763 diff --git a/net/core/sock.c b/net/core/sock.c
31764 index 9c708a5fb751..823377a7e63a 100644
31765 --- a/net/core/sock.c
31766 +++ b/net/core/sock.c
31767 @@ -2447,12 +2447,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
31768         if (sk->sk_lock.owned)
31769                 __lock_sock(sk);
31770         sk->sk_lock.owned = 1;
31771 -       spin_unlock(&sk->sk_lock.slock);
31772 +       spin_unlock_bh(&sk->sk_lock.slock);
31773         /*
31774          * The sk_lock has mutex_lock() semantics here:
31775          */
31776         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
31777 -       local_bh_enable();
31779  EXPORT_SYMBOL(lock_sock_nested);
31781 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
31782 index 36e26977c908..ff2593269089 100644
31783 --- a/net/ipv4/icmp.c
31784 +++ b/net/ipv4/icmp.c
31785 @@ -69,6 +69,7 @@
31786  #include <linux/jiffies.h>
31787  #include <linux/kernel.h>
31788  #include <linux/fcntl.h>
31789 +#include <linux/sysrq.h>
31790  #include <linux/socket.h>
31791  #include <linux/in.h>
31792  #include <linux/inet.h>
31793 @@ -77,6 +78,7 @@
31794  #include <linux/string.h>
31795  #include <linux/netfilter_ipv4.h>
31796  #include <linux/slab.h>
31797 +#include <linux/locallock.h>
31798  #include <net/snmp.h>
31799  #include <net/ip.h>
31800  #include <net/route.h>
31801 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
31802   *
31803   *     On SMP we have one ICMP socket per-cpu.
31804   */
31805 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
31807  static struct sock *icmp_sk(struct net *net)
31809         return *this_cpu_ptr(net->ipv4.icmp_sk);
31810 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
31812         local_bh_disable();
31814 +       local_lock(icmp_sk_lock);
31815         sk = icmp_sk(net);
31817         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
31818                 /* This can happen if the output path signals a
31819                  * dst_link_failure() for an outgoing ICMP packet.
31820                  */
31821 +               local_unlock(icmp_sk_lock);
31822                 local_bh_enable();
31823                 return NULL;
31824         }
31825 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
31826  static inline void icmp_xmit_unlock(struct sock *sk)
31828         spin_unlock_bh(&sk->sk_lock.slock);
31829 +       local_unlock(icmp_sk_lock);
31832  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
31833 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31834         struct sock *sk;
31835         struct sk_buff *skb;
31837 +       local_lock(icmp_sk_lock);
31838         sk = icmp_sk(dev_net((*rt)->dst.dev));
31839         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
31840                            icmp_param->data_len+icmp_param->head_len,
31841 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
31842                 skb->ip_summed = CHECKSUM_NONE;
31843                 ip_push_pending_frames(sk, fl4);
31844         }
31845 +       local_unlock(icmp_sk_lock);
31848  /*
31849 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
31852  /*
31853 + * 32-bit and 64-bit pings have different timestamp lengths, so we check for
31854 + * the cookie at offset 20 and verify it is repeated at offset 50
31855 + */
31856 +#define CO_POS0                20
31857 +#define CO_POS1                50
31858 +#define CO_SIZE                sizeof(int)
31859 +#define ICMP_SYSRQ_SIZE        57
31862 + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
31863 + * pattern and if it matches send the next byte as a trigger to sysrq.
31864 + */
31865 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
31867 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
31868 +       char *p = skb->data;
31870 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
31871 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
31872 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
31873 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
31877   *     Handle ICMP_ECHO ("ping") requests.
31878   *
31879   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
31880 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
31881                 icmp_param.data_len        = skb->len;
31882                 icmp_param.head_len        = sizeof(struct icmphdr);
31883                 icmp_reply(&icmp_param, skb);
31885 +               if (skb->len == ICMP_SYSRQ_SIZE &&
31886 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
31887 +                       icmp_check_sysrq(net, skb);
31888 +               }
31889         }
31890         /* should there be an ICMP stat for ignored echos? */
31891         return true;
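
Two separate things happen in this file. First, the per-CPU ICMP transmit socket is additionally guarded by the icmp_sk_lock local lock, the usual RT concern being that local_bh_disable() no longer keeps the task exclusive on its CPU, so two senders could otherwise interleave on the same per-CPU socket. Second, a sysrq-over-ICMP trigger is added: icmp_check_sysrq() fires when a 57-byte echo payload carries the configured cookie at offsets 20 and 50 with the same trigger byte right after each copy. The following userspace sketch is offered only as an illustration of the layout that check expects; the cookie value and the sysrq command are arbitrary examples, and actually sending the packet is left out.

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define CO_POS0         20
#define CO_POS1         50
#define CO_SIZE         sizeof(int)
#define ICMP_SYSRQ_SIZE 57

/* Fill a 57-byte echo payload so that icmp_check_sysrq() would match it. */
static void build_sysrq_payload(unsigned char *buf, uint32_t cookie, char sysrq_cmd)
{
	int cookie_net = htonl(cookie);         /* kernel side compares against htonl(sysctl) */

	memset(buf, 0, ICMP_SYSRQ_SIZE);
	memcpy(buf + CO_POS0, &cookie_net, CO_SIZE);
	memcpy(buf + CO_POS1, &cookie_net, CO_SIZE);
	buf[CO_POS0 + CO_SIZE] = sysrq_cmd;     /* trigger byte ...              */
	buf[CO_POS1 + CO_SIZE] = sysrq_cmd;     /* ... must match in both places */
}

The net.ipv4.icmp_echo_sysrq sysctl added in the next hunk has to be set to the same cookie value, otherwise icmp_echo() never calls into the check.
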
31892 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
31893 index 70fb352e317f..1bcd436709a4 100644
31894 --- a/net/ipv4/sysctl_net_ipv4.c
31895 +++ b/net/ipv4/sysctl_net_ipv4.c
31896 @@ -818,6 +818,13 @@ static struct ctl_table ipv4_net_table[] = {
31897                 .proc_handler   = proc_dointvec
31898         },
31899         {
31900 +               .procname       = "icmp_echo_sysrq",
31901 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
31902 +               .maxlen         = sizeof(int),
31903 +               .mode           = 0644,
31904 +               .proc_handler   = proc_dointvec
31905 +       },
31906 +       {
31907                 .procname       = "icmp_ignore_bogus_error_responses",
31908                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
31909                 .maxlen         = sizeof(int),
31910 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
31911 index 198fc2314c82..6d84e7c8c8a9 100644
31912 --- a/net/ipv4/tcp_ipv4.c
31913 +++ b/net/ipv4/tcp_ipv4.c
31914 @@ -62,6 +62,7 @@
31915  #include <linux/init.h>
31916  #include <linux/times.h>
31917  #include <linux/slab.h>
31918 +#include <linux/locallock.h>
31920  #include <net/net_namespace.h>
31921  #include <net/icmp.h>
31922 @@ -570,6 +571,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
31924  EXPORT_SYMBOL(tcp_v4_send_check);
31926 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
31927  /*
31928   *     This routine will send an RST to the other tcp.
31929   *
31930 @@ -691,10 +693,13 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
31931                 arg.bound_dev_if = sk->sk_bound_dev_if;
31933         arg.tos = ip_hdr(skb)->tos;
31935 +       local_lock(tcp_sk_lock);
31936         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31937                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31938                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31939                               &arg, arg.iov[0].iov_len);
31940 +       local_unlock(tcp_sk_lock);
31942         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
31943         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
31944 @@ -776,10 +781,12 @@ static void tcp_v4_send_ack(struct net *net,
31945         if (oif)
31946                 arg.bound_dev_if = oif;
31947         arg.tos = tos;
31948 +       local_lock(tcp_sk_lock);
31949         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
31950                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
31951                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
31952                               &arg, arg.iov[0].iov_len);
31953 +       local_unlock(tcp_sk_lock);
31955         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
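
tcp_sk_lock plays the same role for the per-CPU tcp_sk reply socket as icmp_sk_lock does above: ip_send_unicast_reply() can block on RT (its internal locks become sleeping locks), so preemption cannot simply be disabled around the this_cpu_ptr() access, and a local lock keeps two tasks on the same CPU from interleaving on the socket. A sketch of the bare pattern with an invented per-CPU context; reply_ctx and reply_ctx_lock are made-up, local_lock()/local_unlock() are the primitives used in the hunks.

#include <linux/percpu.h>
#include <linux/locallock.h>

struct reply_ctx { int last_id; };              /* invented example */

static DEFINE_PER_CPU(struct reply_ctx, reply_ctx);
static DEFINE_LOCAL_IRQ_LOCK(reply_ctx_lock);

static void reply_ctx_note(int id)
{
	struct reply_ctx *ctx;

	/* On RT this is a per-CPU sleeping lock, so the section below may
	 * block, yet no second task on this CPU can enter it concurrently. */
	local_lock(reply_ctx_lock);
	ctx = this_cpu_ptr(&reply_ctx);
	ctx->last_id = id;
	local_unlock(reply_ctx_lock);
}
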
31957 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
31958 index 9f0915f72702..28d234c03cb5 100644
31959 --- a/net/mac80211/rx.c
31960 +++ b/net/mac80211/rx.c
31961 @@ -3601,7 +3601,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb,
31962         struct ieee80211_supported_band *sband;
31963         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
31965 -       WARN_ON_ONCE(softirq_count() == 0);
31966 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
31968         if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
31969                 goto drop;
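
WARN_ON_ONCE_NONRT() keeps the "must run in BH context" assertion for mainline kernels but compiles it out on PREEMPT_RT_FULL, where the RX path runs in a preemptible context and softirq_count() no longer reflects that assumption. A sketch of how the helper is plausibly defined; the authoritative definition lives in the bug.h changes earlier in this patch.

/* Sketch only -- see the asm-generic/bug.h hunk of this patch for the
 * real definition. */
#ifndef CONFIG_PREEMPT_RT_FULL
# define WARN_ON_ONCE_NONRT(condition)  WARN_ON_ONCE(condition)
#else
# define WARN_ON_ONCE_NONRT(condition)  do { } while (0)
#endif
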
31970 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
31971 index f39276d1c2d7..10880c89d62f 100644
31972 --- a/net/netfilter/core.c
31973 +++ b/net/netfilter/core.c
31974 @@ -22,11 +22,17 @@
31975  #include <linux/proc_fs.h>
31976  #include <linux/mutex.h>
31977  #include <linux/slab.h>
31978 +#include <linux/locallock.h>
31979  #include <net/net_namespace.h>
31980  #include <net/sock.h>
31982  #include "nf_internals.h"
31984 +#ifdef CONFIG_PREEMPT_RT_BASE
31985 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
31986 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
31987 +#endif
31989  static DEFINE_MUTEX(afinfo_mutex);
31991  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
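
The only change here is infrastructure: under PREEMPT_RT_BASE a per-CPU local lock, xt_write_lock, is defined and exported for modules. Its consumer is the xt_write_recseq sequence-count machinery in the x_tables header, which this patch modifies elsewhere; on RT the writer side can no longer rely on bottom halves being disabled for exclusion, so it takes this lock instead. A hedged sketch of what a writer-side helper built on the exported lock could look like; the helper names are invented, DECLARE_LOCAL_IRQ_LOCK()/local_lock()/local_unlock() are the locallock API.

#include <linux/locallock.h>

#ifdef CONFIG_PREEMPT_RT_BASE
DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);

/* Invented helper names, shown only to illustrate how the exported lock
 * would bracket a per-CPU writer section on RT. */
static inline void xt_writer_enter(void)
{
	local_lock(xt_write_lock);
}

static inline void xt_writer_exit(void)
{
	local_unlock(xt_write_lock);
}
#endif
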
31992 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
31993 index f8d6a0ca9c03..f6be45510cf6 100644
31994 --- a/net/packet/af_packet.c
31995 +++ b/net/packet/af_packet.c
31996 @@ -63,6 +63,7 @@
31997  #include <linux/if_packet.h>
31998  #include <linux/wireless.h>
31999  #include <linux/kernel.h>
32000 +#include <linux/delay.h>
32001  #include <linux/kmod.h>
32002  #include <linux/slab.h>
32003  #include <linux/vmalloc.h>
32004 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
32005         if (BLOCK_NUM_PKTS(pbd)) {
32006                 while (atomic_read(&pkc->blk_fill_in_prog)) {
32007                         /* Waiting for skb_copy_bits to finish... */
32008 -                       cpu_relax();
32009 +                       cpu_chill();
32010                 }
32011         }
32013 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
32014                 if (!(status & TP_STATUS_BLK_TMO)) {
32015                         while (atomic_read(&pkc->blk_fill_in_prog)) {
32016                                 /* Waiting for skb_copy_bits to finish... */
32017 -                               cpu_relax();
32018 +                               cpu_chill();
32019                         }
32020                 }
32021                 prb_close_block(pkc, pbd, po, status);
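
Both busy-wait loops above poll blk_fill_in_prog until another context finishes skb_copy_bits(). With cpu_relax(), a high-priority RT task spinning here could starve the very task it is waiting for if that task was preempted on the same CPU; cpu_chill(), introduced earlier in this patch, behaves like cpu_relax() on mainline but sleeps briefly on RT so the preempted task can run. Minimal sketch of the idiom on an invented flag:

#include <linux/atomic.h>
#include <linux/delay.h>

static atomic_t fill_in_progress;       /* invented stand-in for blk_fill_in_prog */

static void wait_for_fill(void)
{
	/* cpu_relax() on !RT; a short sleep on RT so the flag owner can run. */
	while (atomic_read(&fill_in_progress))
		cpu_chill();
}
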
32022 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
32023 index a2340748ec86..19123a97b354 100644
32024 --- a/net/rds/ib_rdma.c
32025 +++ b/net/rds/ib_rdma.c
32026 @@ -34,6 +34,7 @@
32027  #include <linux/slab.h>
32028  #include <linux/rculist.h>
32029  #include <linux/llist.h>
32030 +#include <linux/delay.h>
32032  #include "rds.h"
32033  #include "ib.h"
32034 @@ -313,7 +314,7 @@ static inline void wait_clean_list_grace(void)
32035         for_each_online_cpu(cpu) {
32036                 flag = &per_cpu(clean_list_grace, cpu);
32037                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
32038 -                       cpu_relax();
32039 +                       cpu_chill();
32040         }
32043 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
32044 index aa4725038f94..00b81cab28f3 100644
32045 --- a/net/sched/sch_generic.c
32046 +++ b/net/sched/sch_generic.c
32047 @@ -893,7 +893,7 @@ void dev_deactivate_many(struct list_head *head)
32048         /* Wait for outstanding qdisc_run calls. */
32049         list_for_each_entry(dev, head, close_list)
32050                 while (some_qdisc_is_busy(dev))
32051 -                       yield();
32052 +                       msleep(1);
32055  void dev_deactivate(struct net_device *dev)
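
Same theme as the cpu_chill() changes: dev_deactivate_many() waits for qdiscs to go idle, and yield() is not a reliable way to do that when the caller runs with real-time priority, because yielding does not let lower-priority work run and the loop can live-lock. msleep(1) actually blocks, so the busy qdisc gets CPU time. Sketch on an invented flag:

#include <linux/atomic.h>
#include <linux/delay.h>

static atomic_t qdisc_busy;             /* invented stand-in for some_qdisc_is_busy() */

static void wait_for_idle(void)
{
	/* Block for a tick per iteration instead of yield()ing, so even a
	 * SCHED_FIFO caller lets the remaining work complete. */
	while (atomic_read(&qdisc_busy))
		msleep(1);
}
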
32056 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
32057 index a6cbb2104667..5b69bb580617 100644
32058 --- a/net/sunrpc/svc_xprt.c
32059 +++ b/net/sunrpc/svc_xprt.c
32060 @@ -340,7 +340,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
32061                 goto out;
32062         }
32064 -       cpu = get_cpu();
32065 +       cpu = get_cpu_light();
32066         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
32068         atomic_long_inc(&pool->sp_stats.packets);
32069 @@ -376,7 +376,7 @@ redo_search:
32071                 atomic_long_inc(&pool->sp_stats.threads_woken);
32072                 wake_up_process(rqstp->rq_task);
32073 -               put_cpu();
32074 +               put_cpu_light();
32075                 goto out;
32076         }
32077         rcu_read_unlock();
32078 @@ -397,7 +397,7 @@ redo_search:
32079                 goto redo_search;
32080         }
32081         rqstp = NULL;
32082 -       put_cpu();
32083 +       put_cpu_light();
32084  out:
32085         trace_svc_xprt_do_enqueue(xprt, rqstp);
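
svc_xprt_do_enqueue() only needs a CPU number to pick a pool; it does not need preemption disabled across the whole enqueue, which on RT would also forbid the sleeping locks taken further down. get_cpu_light(), added by this patch, is plain get_cpu() on mainline, while on RT it only disables migration, so the task stays on one CPU for indexing purposes but remains preemptible. Sketch with an invented per-bucket counter:

#include <linux/atomic.h>
#include <linux/smp.h>

#define NR_MY_BUCKETS	16
static atomic_long_t my_bucket_hits[NR_MY_BUCKETS];     /* invented example */

static void bucket_account(void)
{
	/* The CPU number is used purely as an index and the update is atomic,
	 * so staying preemptible (the RT behaviour of get_cpu_light()) is fine. */
	int cpu = get_cpu_light();

	atomic_long_inc(&my_bucket_hits[cpu % NR_MY_BUCKETS]);
	put_cpu_light();
}
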
32087 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
32088 index 6fdc97ef6023..523e0420d7f0 100755
32089 --- a/scripts/mkcompile_h
32090 +++ b/scripts/mkcompile_h
32091 @@ -4,7 +4,8 @@ TARGET=$1
32092  ARCH=$2
32093  SMP=$3
32094  PREEMPT=$4
32095 -CC=$5
32096 +RT=$5
32097 +CC=$6
32099  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
32101 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
32102  CONFIG_FLAGS=""
32103  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
32104  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
32105 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
32106  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
32108  # Truncate to maximum length
32109 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
32110 index 4ba64fd49759..34e50186885d 100644
32111 --- a/sound/core/pcm_native.c
32112 +++ b/sound/core/pcm_native.c
32113 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
32114  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
32116         if (!substream->pcm->nonatomic)
32117 -               local_irq_disable();
32118 +               local_irq_disable_nort();
32119         snd_pcm_stream_lock(substream);
32121  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
32122 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
32124         snd_pcm_stream_unlock(substream);
32125         if (!substream->pcm->nonatomic)
32126 -               local_irq_enable();
32127 +               local_irq_enable_nort();
32129  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
32131 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
32133         unsigned long flags = 0;
32134         if (!substream->pcm->nonatomic)
32135 -               local_irq_save(flags);
32136 +               local_irq_save_nort(flags);
32137         snd_pcm_stream_lock(substream);
32138         return flags;
32140 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
32142         snd_pcm_stream_unlock(substream);
32143         if (!substream->pcm->nonatomic)
32144 -               local_irq_restore(flags);
32145 +               local_irq_restore_nort(flags);
32147  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
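
On RT the PCM stream lock is a sleeping lock, so the atomic-case path must not take it with hard interrupts disabled. The *_nort variants, defined earlier in this patch, keep the historical IRQ-off behaviour on mainline and drop the IRQ manipulation on RT. Sketch of the idiom on an invented lock; it assumes the *_nort helpers are in scope via the usual core headers.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_stream_lock);       /* invented example */
static int demo_stream_state;

static void demo_stream_set(int v)
{
	unsigned long flags = 0;

	/* local_irq_save() on mainline; no hard-IRQ fiddling on RT, where
	 * demo_stream_lock would be a sleeping lock anyway. */
	local_irq_save_nort(flags);
	spin_lock(&demo_stream_lock);
	demo_stream_state = v;
	spin_unlock(&demo_stream_lock);
	local_irq_restore_nort(flags);
}
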
32149 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
32150 index 4f70d12e392d..9378d0919ed8 100644
32151 --- a/virt/kvm/async_pf.c
32152 +++ b/virt/kvm/async_pf.c
32153 @@ -98,8 +98,8 @@ static void async_pf_execute(struct work_struct *work)
32154          * This memory barrier pairs with prepare_to_wait's set_current_state()
32155          */
32156         smp_mb();
32157 -       if (waitqueue_active(&vcpu->wq))
32158 -               wake_up_interruptible(&vcpu->wq);
32159 +       if (swait_active(&vcpu->wq))
32160 +               swake_up(&vcpu->wq);
32162         mmput(mm);
32163         kvm_put_kvm(vcpu->kvm);
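
This hunk and the kvm_main.c changes below move the vCPU wait queue from the regular wait_queue_head_t to the simple wait queue (swait) API that this patch backports; swait keeps the wake-up side short and runs it under a raw lock, which is what the RT guest-wakeup paths want. A minimal sketch of both sides of the pattern, using an invented queue and condition; the swait calls themselves are the ones used in the hunks.

#include <linux/swait.h>
#include <linux/sched.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);       /* invented example */
static bool demo_ready;

static void demo_wait_for_event(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&demo_wq, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(demo_ready))
			break;
		schedule();
	}
	finish_swait(&demo_wq, &wait);
}

static void demo_signal_event(void)
{
	WRITE_ONCE(demo_ready, true);
	/* Wakes a single waiter; cheap enough for the async-PF/IRQ paths above. */
	if (swait_active(&demo_wq))
		swake_up(&demo_wq);
}
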
32164 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
32165 index cb092bd9965b..133e7fa2ec8b 100644
32166 --- a/virt/kvm/kvm_main.c
32167 +++ b/virt/kvm/kvm_main.c
32168 @@ -228,8 +228,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
32169         vcpu->kvm = kvm;
32170         vcpu->vcpu_id = id;
32171         vcpu->pid = NULL;
32172 -       vcpu->halt_poll_ns = 0;
32173 -       init_waitqueue_head(&vcpu->wq);
32174 +       init_swait_queue_head(&vcpu->wq);
32175         kvm_async_pf_vcpu_init(vcpu);
32177         vcpu->pre_pcpu = -1;
32178 @@ -2008,7 +2007,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
32179  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32181         ktime_t start, cur;
32182 -       DEFINE_WAIT(wait);
32183 +       DECLARE_SWAITQUEUE(wait);
32184         bool waited = false;
32185         u64 block_ns;
32187 @@ -2033,7 +2032,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32188         kvm_arch_vcpu_blocking(vcpu);
32190         for (;;) {
32191 -               prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
32192 +               prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
32194                 if (kvm_vcpu_check_block(vcpu) < 0)
32195                         break;
32196 @@ -2042,7 +2041,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
32197                 schedule();
32198         }
32200 -       finish_wait(&vcpu->wq, &wait);
32201 +       finish_swait(&vcpu->wq, &wait);
32202         cur = ktime_get();
32204         kvm_arch_vcpu_unblocking(vcpu);
32205 @@ -2074,11 +2073,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
32207         int me;
32208         int cpu = vcpu->cpu;
32209 -       wait_queue_head_t *wqp;
32210 +       struct swait_queue_head *wqp;
32212         wqp = kvm_arch_vcpu_wq(vcpu);
32213 -       if (waitqueue_active(wqp)) {
32214 -               wake_up_interruptible(wqp);
32215 +       if (swait_active(wqp)) {
32216 +               swake_up(wqp);
32217                 ++vcpu->stat.halt_wakeup;
32218         }
32220 @@ -2179,7 +2178,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
32221                                 continue;
32222                         if (vcpu == me)
32223                                 continue;
32224 -                       if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
32225 +                       if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
32226                                 continue;
32227                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
32228                                 continue;