rt-patch: add preempt-rt patch to rpi kernel
[openadk.git] / target / linux / patches / 72134397d72079a533c8fc742701fdc7f5ae7c5b / patch-realtime
1 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/Documentation/sysrq.txt linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/Documentation/sysrq.txt
2 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/Documentation/sysrq.txt 2017-04-16 10:37:28.000000000 +0200
3 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/Documentation/sysrq.txt      2017-04-18 17:54:21.000000000 +0200
4 @@ -59,10 +59,17 @@
5  On other - If you know of the key combos for other architectures, please
6             let me know so I can add them to this section.
7  
8 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 +On all -  write a character to /proc/sysrq-trigger, e.g.:
11                 echo t > /proc/sysrq-trigger
13 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
14 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
15 +        Send an ICMP echo request with this pattern plus the particular
16 +        SysRq command key. Example:
17 +               # ping -c1 -s57 -p0102030468
18 +        will trigger the SysRq-H (help) command.
21  *  What are the 'command' keys?
22  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23  'b'     - Will immediately reboot the system without syncing or unmounting
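The sysrq.txt hunk above documents the new network SysRq trigger. As a minimal shell sketch of that procedure (not part of the patch; the target host is a placeholder), the two documented steps look like this:

# Enable network SysRq with the cookie from the documentation above.
echo 0x01020304 > /proc/sys/net/ipv4/icmp_echo_sysrq

# Send an ICMP echo whose data pattern is the cookie followed by the
# SysRq command key; 0x68 ('h') triggers the SysRq-H help output.
ping -c1 -s57 -p0102030468 <target-host>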
24 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/Documentation/trace/histograms.txt linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/Documentation/trace/histograms.txt
25 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/Documentation/trace/histograms.txt      1970-01-01 01:00:00.000000000 +0100
26 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/Documentation/trace/histograms.txt   2017-04-18 17:54:22.000000000 +0200
27 @@ -0,0 +1,186 @@
28 +               Using the Linux Kernel Latency Histograms
31 +This document gives a short explanation of how to enable, configure and use
32 +latency histograms. Latency histograms are primarily relevant in the
33 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
34 +and are used in the quality management of the Linux real-time
35 +capabilities.
38 +* Purpose of latency histograms
40 +A latency histogram continuously accumulates the frequencies of latency
41 +data. There are two types of histograms:
42 +- potential sources of latencies
43 +- effective latencies
46 +* Potential sources of latencies
48 +Potential sources of latencies are code segments where interrupts,
49 +preemption or both are disabled (aka critical sections). To create
50 +histograms of potential sources of latency, the kernel stores the time
51 +stamp at the start of a critical section, determines the time elapsed
52 +when the end of the section is reached, and increments the frequency
53 +counter of that latency value - irrespective of whether any concurrently
54 +running process is affected by latency or not.
55 +- Configuration items (in the Kernel hacking/Tracers submenu)
56 +  CONFIG_INTERRUPT_OFF_LATENCY
57 +  CONFIG_PREEMPT_OFF_LATENCY
60 +* Effective latencies
62 +Effective latencies are actually occurring during wakeup of a process. To
63 +determine effective latencies, the kernel stores the time stamp when a
64 +process is scheduled to be woken up, and determines the duration of the
65 +wakeup time shortly before control is passed over to this process. Note
66 +that the apparent latency in user space may be somewhat longer, since the
67 +process may be interrupted after control is passed over to it but before
68 +the execution in user space takes place. Simply measuring the interval
69 +between enqueuing and wakeup may also not be appropriate in cases when a
70 +process is scheduled as a result of a timer expiration. The timer may have
71 +missed its deadline, e.g. due to disabled interrupts, but this latency
72 +would not be registered. Therefore, the offsets of missed timers are
73 +recorded in a separate histogram. If both wakeup latency and missed timer
74 +offsets are configured and enabled, a third histogram may be enabled that
75 +records the overall latency as a sum of the timer latency, if any, and the
76 +wakeup latency. This histogram is called "timerandwakeup".
77 +- Configuration items (in the Kernel hacking/Tracers submenu)
78 +  CONFIG_WAKEUP_LATENCY
79 +  CONFIG_MISSED_TIMER_OFFSETS
82 +* Usage
84 +The interface to the administration of the latency histograms is located
85 +in the debugfs file system. To mount it, either enter
87 +mount -t sysfs nodev /sys
88 +mount -t debugfs nodev /sys/kernel/debug
90 +from the shell command line, or add
92 +nodev  /sys                    sysfs   defaults        0 0
93 +nodev  /sys/kernel/debug       debugfs defaults        0 0
95 +to the file /etc/fstab. All latency histogram related files are then
96 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
97 +particular histogram type is enabled by writing non-zero to the related
98 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
99 +Select "preemptirqsoff" for the histograms of potential sources of
100 +latencies and "wakeup" for histograms of effective latencies etc. The
101 +histogram data - one per CPU - are available in the files
103 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
104 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
105 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
106 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
111 +The histograms are reset by writing non-zero to the file "reset" in a
112 +particular latency directory. To reset all latency data, use
114 +#!/bin/sh
116 +TRACINGDIR=/sys/kernel/debug/tracing
117 +HISTDIR=$TRACINGDIR/latency_hist
119 +if test -d $HISTDIR
120 +then
121 +  cd $HISTDIR
122 +  for i in `find . | grep /reset$`
123 +  do
124 +    echo 1 >$i
125 +  done
126 +fi
129 +* Data format
131 +Latency data are stored with a resolution of one microsecond. The
132 +maximum latency is 10,240 microseconds. The data are only valid if the
133 +overflow register is empty. Every output line contains the latency in
134 +microseconds in the first column and the number of samples in the second
135 +column. To display only lines with a positive latency count, use, for
136 +example,
138 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
140 +#Minimum latency: 0 microseconds.
141 +#Average latency: 0 microseconds.
142 +#Maximum latency: 25 microseconds.
143 +#Total samples: 3104770694
144 +#There are 0 samples greater or equal than 10240 microseconds
145 +#usecs          samples
146 +    0        2984486876
147 +    1          49843506
148 +    2          58219047
149 +    3           5348126
150 +    4           2187960
151 +    5           3388262
152 +    6            959289
153 +    7            208294
154 +    8             40420
155 +    9              4485
156 +   10             14918
157 +   11             18340
158 +   12             25052
159 +   13             19455
160 +   14              5602
161 +   15               969
162 +   16                47
163 +   17                18
164 +   18                14
165 +   19                 1
166 +   20                 3
167 +   21                 2
168 +   22                 5
169 +   23                 2
170 +   25                 1
173 +* Wakeup latency of a selected process
175 +To only collect wakeup latency data of a particular process, write the
176 +PID of the requested process to
178 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
180 +PIDs are not considered if this variable is set to 0.
183 +* Details of the process with the highest wakeup latency so far
185 +Selected data of the process that suffered from the highest wakeup
186 +latency that occurred in a particular CPU are available in the file
188 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
190 +In addition, other relevant system data at the time when the
191 +latency occurred are given.
193 +The format of the data is (all in one line):
194 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
195 +<- <PID> <Priority> <Command> <Timestamp>
197 +The value of <Timeroffset> is only relevant in the combined timer
198 +and wakeup latency recording. In the wakeup recording, it is
199 +always 0; in the missed_timer_offsets recording, it is the same
200 +as <Latency>.
202 +When retrospectively searching for the origin of a latency and
203 +tracing was not enabled, it may be helpful to know the name and
204 +some basic data of the task that (finally) was switching to the
205 +late real-time task. In addition to the victim's data, the
206 +data of the possible culprit are therefore also displayed after the
207 +"<-" symbol.
209 +Finally, the timestamp of the time when the latency occurred
210 +in <seconds>.<microseconds> after the most recent system boot
211 +is provided.
213 +These data are also reset when the wakeup histogram is reset.
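Tying the histograms.txt instructions above together, a minimal usage sketch follows (not part of the patch; the wakeup histogram, PID 1234 and the CPU0 file are example choices taken from the documentation above):

#!/bin/sh
# Mount debugfs and locate the latency histogram directory (see "Usage" above).
mount -t debugfs nodev /sys/kernel/debug
LH=/sys/kernel/debug/tracing/latency_hist

# Enable the wakeup (effective latency) histogram by writing non-zero.
echo 1 > $LH/enable/wakeup

# Optionally record wakeup latencies of a single process only (0 = all PIDs).
echo 1234 > $LH/wakeup/pid

# Show only the latency values that actually have samples on CPU 0.
grep -v " 0$" $LH/wakeup/CPU0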
214 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/Kconfig
215 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/Kconfig    2017-04-16 10:37:28.000000000 +0200
216 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/Kconfig 2017-04-18 17:54:20.000000000 +0200
217 @@ -9,6 +9,7 @@
218         tristate "OProfile system profiling"
219         depends on PROFILING
220         depends on HAVE_OPROFILE
221 +       depends on !PREEMPT_RT_FULL
222         select RING_BUFFER
223         select RING_BUFFER_ALLOW_SWAP
224         help
225 @@ -52,6 +53,7 @@
226  config JUMP_LABEL
227         bool "Optimize very unlikely/likely branches"
228         depends on HAVE_ARCH_JUMP_LABEL
229 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
230         help
231           This option enables a transparent branch optimization that
232          makes certain almost-always-true or almost-always-false branch
233 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/Kconfig
234 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/Kconfig        2017-04-16 10:37:29.000000000 +0200
235 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/Kconfig     2017-04-18 17:54:19.000000000 +0200
236 @@ -36,7 +36,7 @@
237         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
238         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
239         select HAVE_ARCH_HARDENED_USERCOPY
240 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
241 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
242         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
243         select HAVE_ARCH_MMAP_RND_BITS if MMU
244         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
245 @@ -75,6 +75,7 @@
246         select HAVE_PERF_EVENTS
247         select HAVE_PERF_REGS
248         select HAVE_PERF_USER_STACK_DUMP
249 +       select HAVE_PREEMPT_LAZY
250         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
251         select HAVE_REGS_AND_STACK_ACCESS_API
252         select HAVE_SYSCALL_TRACEPOINTS
253 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/irq.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/irq.h
254 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/irq.h      2017-04-16 10:37:30.000000000 +0200
255 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/irq.h   2017-04-18 17:54:19.000000000 +0200
256 @@ -22,6 +22,8 @@
257  #endif
259  #ifndef __ASSEMBLY__
260 +#include <linux/cpumask.h>
262  struct irqaction;
263  struct pt_regs;
264  extern void migrate_irqs(void);
265 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/switch_to.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/switch_to.h
266 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/switch_to.h        2017-04-16 10:37:30.000000000 +0200
267 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/switch_to.h     2017-04-18 17:54:19.000000000 +0200
268 @@ -3,6 +3,13 @@
270  #include <linux/thread_info.h>
272 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
273 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
274 +#else
275 +static inline void
276 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
277 +#endif
279  /*
280   * For v7 SMP cores running a preemptible kernel we may be pre-empted
281   * during a TLB maintenance operation, so execute an inner-shareable dsb
282 @@ -25,6 +32,7 @@
283  #define switch_to(prev,next,last)                                      \
284  do {                                                                   \
285         __complete_pending_tlbi();                                      \
286 +       switch_kmaps(prev, next);                                       \
287         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
288  } while (0)
290 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/thread_info.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/thread_info.h
291 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/include/asm/thread_info.h      2017-04-16 10:37:30.000000000 +0200
292 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/include/asm/thread_info.h   2017-04-18 17:54:19.000000000 +0200
293 @@ -49,6 +49,7 @@
294  struct thread_info {
295         unsigned long           flags;          /* low level flags */
296         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
297 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
298         mm_segment_t            addr_limit;     /* address limit */
299         struct task_struct      *task;          /* main task structure */
300         __u32                   cpu;            /* cpu */
301 @@ -142,7 +143,8 @@
302  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
303  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
304  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
305 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
306 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
307 +#define TIF_NEED_RESCHED_LAZY  7
309  #define TIF_NOHZ               12      /* in adaptive nohz mode */
310  #define TIF_USING_IWMMXT       17
311 @@ -152,6 +154,7 @@
312  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
313  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
314  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
315 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
316  #define _TIF_UPROBE            (1 << TIF_UPROBE)
317  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
318  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
319 @@ -167,7 +170,8 @@
320   * Change these and you break ASM code in entry-common.S
321   */
322  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
323 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
324 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
325 +                                _TIF_NEED_RESCHED_LAZY)
327  #endif /* __KERNEL__ */
328  #endif /* __ASM_ARM_THREAD_INFO_H */
329 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/asm-offsets.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/asm-offsets.c
330 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/asm-offsets.c   2017-04-16 10:37:30.000000000 +0200
331 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/asm-offsets.c        2017-04-18 17:54:19.000000000 +0200
332 @@ -65,6 +65,7 @@
333    BLANK();
334    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
335    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
336 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
337    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
338    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
339    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
340 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/entry-armv.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/entry-armv.S
341 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/entry-armv.S    2017-04-16 10:37:30.000000000 +0200
342 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/entry-armv.S 2017-04-18 17:54:19.000000000 +0200
343 @@ -220,11 +220,18 @@
345  #ifdef CONFIG_PREEMPT
346         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
347 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
348         teq     r8, #0                          @ if preempt count != 0
349 +       bne     1f                              @ return from exception
350 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
351 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
352 +       blne    svc_preempt                     @ preempt!
354 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
355 +       teq     r8, #0                          @ if preempt lazy count != 0
356         movne   r0, #0                          @ force flags to 0
357 -       tst     r0, #_TIF_NEED_RESCHED
358 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
359         blne    svc_preempt
361  #endif
363         svc_exit r5, irq = 1                    @ return from exception
364 @@ -239,8 +246,14 @@
365  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
366         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
367         tst     r0, #_TIF_NEED_RESCHED
368 +       bne     1b
369 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
370         reteq   r8                              @ go again
371 -       b       1b
372 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
373 +       teq     r0, #0                          @ if preempt lazy count != 0
374 +       beq     1b
375 +       ret     r8                              @ go again
377  #endif
379  __und_fault:
380 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/entry-common.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/entry-common.S
381 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/entry-common.S  2017-04-16 10:37:30.000000000 +0200
382 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/entry-common.S       2017-04-18 17:54:19.000000000 +0200
383 @@ -36,7 +36,9 @@
384   UNWIND(.cantunwind    )
385         disable_irq_notrace                     @ disable interrupts
386         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
387 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
388 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
389 +       bne     fast_work_pending
390 +       tst     r1, #_TIF_SECCOMP
391         bne     fast_work_pending
393         /* perform architecture specific actions before user return */
394 @@ -62,8 +64,11 @@
395         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
396         disable_irq_notrace                     @ disable interrupts
397         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
398 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
399 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
400 +       bne     do_slower_path
401 +       tst     r1, #_TIF_SECCOMP
402         beq     no_work_pending
403 +do_slower_path:
404   UNWIND(.fnend         )
405  ENDPROC(ret_fast_syscall)
407 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/patch.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/patch.c
408 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/patch.c 2017-04-16 10:37:30.000000000 +0200
409 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/patch.c      2017-04-18 17:54:19.000000000 +0200
410 @@ -15,7 +15,7 @@
411         unsigned int insn;
412  };
414 -static DEFINE_SPINLOCK(patch_lock);
415 +static DEFINE_RAW_SPINLOCK(patch_lock);
417  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
418         __acquires(&patch_lock)
419 @@ -32,7 +32,7 @@
420                 return addr;
422         if (flags)
423 -               spin_lock_irqsave(&patch_lock, *flags);
424 +               raw_spin_lock_irqsave(&patch_lock, *flags);
425         else
426                 __acquire(&patch_lock);
428 @@ -47,7 +47,7 @@
429         clear_fixmap(fixmap);
431         if (flags)
432 -               spin_unlock_irqrestore(&patch_lock, *flags);
433 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
434         else
435                 __release(&patch_lock);
437 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/process.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/process.c
438 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/process.c       2017-04-16 10:37:30.000000000 +0200
439 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/process.c    2017-04-18 17:54:19.000000000 +0200
440 @@ -322,6 +322,30 @@
443  #ifdef CONFIG_MMU
445 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
446 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
447 + * fail.
448 + */
449 +static int __init vectors_user_mapping_init_page(void)
451 +       struct page *page;
452 +       unsigned long addr = 0xffff0000;
453 +       pgd_t *pgd;
454 +       pud_t *pud;
455 +       pmd_t *pmd;
457 +       pgd = pgd_offset_k(addr);
458 +       pud = pud_offset(pgd, addr);
459 +       pmd = pmd_offset(pud, addr);
460 +       page = pmd_page(*(pmd));
462 +       pgtable_page_ctor(page);
464 +       return 0;
466 +late_initcall(vectors_user_mapping_init_page);
468  #ifdef CONFIG_KUSER_HELPERS
469  /*
470   * The vectors page is always readable from user space for the
471 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/signal.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/signal.c
472 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/signal.c        2017-04-16 10:37:30.000000000 +0200
473 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/signal.c     2017-04-18 17:54:19.000000000 +0200
474 @@ -572,7 +572,8 @@
475          */
476         trace_hardirqs_off();
477         do {
478 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
479 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
480 +                                          _TIF_NEED_RESCHED_LAZY))) {
481                         schedule();
482                 } else {
483                         if (unlikely(!user_mode(regs)))
484 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/smp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/smp.c
485 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/smp.c   2017-04-16 10:37:30.000000000 +0200
486 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/smp.c        2017-04-18 17:54:19.000000000 +0200
487 @@ -234,8 +234,6 @@
488         flush_cache_louis();
489         local_flush_tlb_all();
491 -       clear_tasks_mm_cpumask(cpu);
493         return 0;
496 @@ -251,6 +249,9 @@
497                 pr_err("CPU%u: cpu didn't die\n", cpu);
498                 return;
499         }
501 +       clear_tasks_mm_cpumask(cpu);
503         pr_notice("CPU%u: shutdown\n", cpu);
505         /*
506 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/unwind.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/unwind.c
507 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kernel/unwind.c        2017-04-16 10:37:30.000000000 +0200
508 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kernel/unwind.c     2017-04-18 17:54:19.000000000 +0200
509 @@ -93,7 +93,7 @@
510  static const struct unwind_idx *__origin_unwind_idx;
511  extern const struct unwind_idx __stop_unwind_idx[];
513 -static DEFINE_SPINLOCK(unwind_lock);
514 +static DEFINE_RAW_SPINLOCK(unwind_lock);
515  static LIST_HEAD(unwind_tables);
517  /* Convert a prel31 symbol to an absolute address */
518 @@ -201,7 +201,7 @@
519                 /* module unwind tables */
520                 struct unwind_table *table;
522 -               spin_lock_irqsave(&unwind_lock, flags);
523 +               raw_spin_lock_irqsave(&unwind_lock, flags);
524                 list_for_each_entry(table, &unwind_tables, list) {
525                         if (addr >= table->begin_addr &&
526                             addr < table->end_addr) {
527 @@ -213,7 +213,7 @@
528                                 break;
529                         }
530                 }
531 -               spin_unlock_irqrestore(&unwind_lock, flags);
532 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
533         }
535         pr_debug("%s: idx = %p\n", __func__, idx);
536 @@ -529,9 +529,9 @@
537         tab->begin_addr = text_addr;
538         tab->end_addr = text_addr + text_size;
540 -       spin_lock_irqsave(&unwind_lock, flags);
541 +       raw_spin_lock_irqsave(&unwind_lock, flags);
542         list_add_tail(&tab->list, &unwind_tables);
543 -       spin_unlock_irqrestore(&unwind_lock, flags);
544 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
546         return tab;
548 @@ -543,9 +543,9 @@
549         if (!tab)
550                 return;
552 -       spin_lock_irqsave(&unwind_lock, flags);
553 +       raw_spin_lock_irqsave(&unwind_lock, flags);
554         list_del(&tab->list);
555 -       spin_unlock_irqrestore(&unwind_lock, flags);
556 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
558         kfree(tab);
560 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kvm/arm.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kvm/arm.c
561 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/kvm/arm.c      2017-04-16 10:37:30.000000000 +0200
562 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/kvm/arm.c   2017-04-18 17:54:19.000000000 +0200
563 @@ -619,7 +619,7 @@
564                  * involves poking the GIC, which must be done in a
565                  * non-preemptible context.
566                  */
567 -               preempt_disable();
568 +               migrate_disable();
569                 kvm_pmu_flush_hwstate(vcpu);
570                 kvm_timer_flush_hwstate(vcpu);
571                 kvm_vgic_flush_hwstate(vcpu);
572 @@ -640,7 +640,7 @@
573                         kvm_pmu_sync_hwstate(vcpu);
574                         kvm_timer_sync_hwstate(vcpu);
575                         kvm_vgic_sync_hwstate(vcpu);
576 -                       preempt_enable();
577 +                       migrate_enable();
578                         continue;
579                 }
581 @@ -696,7 +696,7 @@
583                 kvm_vgic_sync_hwstate(vcpu);
585 -               preempt_enable();
586 +               migrate_enable();
588                 ret = handle_exit(vcpu, run, ret);
589         }
590 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-exynos/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-exynos/platsmp.c
591 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-exynos/platsmp.c  2017-04-16 10:37:30.000000000 +0200
592 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-exynos/platsmp.c       2017-04-18 17:54:19.000000000 +0200
593 @@ -229,7 +229,7 @@
594         return (void __iomem *)(S5P_VA_SCU);
597 -static DEFINE_SPINLOCK(boot_lock);
598 +static DEFINE_RAW_SPINLOCK(boot_lock);
600  static void exynos_secondary_init(unsigned int cpu)
602 @@ -242,8 +242,8 @@
603         /*
604          * Synchronise with the boot thread.
605          */
606 -       spin_lock(&boot_lock);
607 -       spin_unlock(&boot_lock);
608 +       raw_spin_lock(&boot_lock);
609 +       raw_spin_unlock(&boot_lock);
612  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
613 @@ -307,7 +307,7 @@
614          * Set synchronisation state between this boot processor
615          * and the secondary one
616          */
617 -       spin_lock(&boot_lock);
618 +       raw_spin_lock(&boot_lock);
620         /*
621          * The secondary processor is waiting to be released from
622 @@ -334,7 +334,7 @@
624                 if (timeout == 0) {
625                         printk(KERN_ERR "cpu1 power enable failed");
626 -                       spin_unlock(&boot_lock);
627 +                       raw_spin_unlock(&boot_lock);
628                         return -ETIMEDOUT;
629                 }
630         }
631 @@ -380,7 +380,7 @@
632          * calibrations, then wait for it to finish
633          */
634  fail:
635 -       spin_unlock(&boot_lock);
636 +       raw_spin_unlock(&boot_lock);
638         return pen_release != -1 ? ret : 0;
640 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-hisi/platmcpm.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-hisi/platmcpm.c
641 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-hisi/platmcpm.c   2017-04-16 10:37:30.000000000 +0200
642 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-hisi/platmcpm.c        2017-04-18 17:54:19.000000000 +0200
643 @@ -61,7 +61,7 @@
645  static void __iomem *sysctrl, *fabric;
646  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
647 -static DEFINE_SPINLOCK(boot_lock);
648 +static DEFINE_RAW_SPINLOCK(boot_lock);
649  static u32 fabric_phys_addr;
650  /*
651   * [0]: bootwrapper physical address
652 @@ -113,7 +113,7 @@
653         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
654                 return -EINVAL;
656 -       spin_lock_irq(&boot_lock);
657 +       raw_spin_lock_irq(&boot_lock);
659         if (hip04_cpu_table[cluster][cpu])
660                 goto out;
661 @@ -147,7 +147,7 @@
663  out:
664         hip04_cpu_table[cluster][cpu]++;
665 -       spin_unlock_irq(&boot_lock);
666 +       raw_spin_unlock_irq(&boot_lock);
668         return 0;
670 @@ -162,11 +162,11 @@
671         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
672         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
674 -       spin_lock(&boot_lock);
675 +       raw_spin_lock(&boot_lock);
676         hip04_cpu_table[cluster][cpu]--;
677         if (hip04_cpu_table[cluster][cpu] == 1) {
678                 /* A power_up request went ahead of us. */
679 -               spin_unlock(&boot_lock);
680 +               raw_spin_unlock(&boot_lock);
681                 return;
682         } else if (hip04_cpu_table[cluster][cpu] > 1) {
683                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
684 @@ -174,7 +174,7 @@
685         }
687         last_man = hip04_cluster_is_down(cluster);
688 -       spin_unlock(&boot_lock);
689 +       raw_spin_unlock(&boot_lock);
690         if (last_man) {
691                 /* Since it's Cortex A15, disable L2 prefetching. */
692                 asm volatile(
693 @@ -203,7 +203,7 @@
694                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
696         count = TIMEOUT_MSEC / POLL_MSEC;
697 -       spin_lock_irq(&boot_lock);
698 +       raw_spin_lock_irq(&boot_lock);
699         for (tries = 0; tries < count; tries++) {
700                 if (hip04_cpu_table[cluster][cpu])
701                         goto err;
702 @@ -211,10 +211,10 @@
703                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
704                 if (data & CORE_WFI_STATUS(cpu))
705                         break;
706 -               spin_unlock_irq(&boot_lock);
707 +               raw_spin_unlock_irq(&boot_lock);
708                 /* Wait for clean L2 when the whole cluster is down. */
709                 msleep(POLL_MSEC);
710 -               spin_lock_irq(&boot_lock);
711 +               raw_spin_lock_irq(&boot_lock);
712         }
713         if (tries >= count)
714                 goto err;
715 @@ -231,10 +231,10 @@
716                 goto err;
717         if (hip04_cluster_is_down(cluster))
718                 hip04_set_snoop_filter(cluster, 0);
719 -       spin_unlock_irq(&boot_lock);
720 +       raw_spin_unlock_irq(&boot_lock);
721         return 1;
722  err:
723 -       spin_unlock_irq(&boot_lock);
724 +       raw_spin_unlock_irq(&boot_lock);
725         return 0;
727  #endif
728 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-omap2/omap-smp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-omap2/omap-smp.c
729 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-omap2/omap-smp.c  2017-04-16 10:37:31.000000000 +0200
730 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-omap2/omap-smp.c       2017-04-18 17:54:19.000000000 +0200
731 @@ -64,7 +64,7 @@
732         .startup_addr = omap5_secondary_startup,
733  };
735 -static DEFINE_SPINLOCK(boot_lock);
736 +static DEFINE_RAW_SPINLOCK(boot_lock);
738  void __iomem *omap4_get_scu_base(void)
740 @@ -131,8 +131,8 @@
741         /*
742          * Synchronise with the boot thread.
743          */
744 -       spin_lock(&boot_lock);
745 -       spin_unlock(&boot_lock);
746 +       raw_spin_lock(&boot_lock);
747 +       raw_spin_unlock(&boot_lock);
750  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
751 @@ -146,7 +146,7 @@
752          * Set synchronisation state between this boot processor
753          * and the secondary one
754          */
755 -       spin_lock(&boot_lock);
756 +       raw_spin_lock(&boot_lock);
758         /*
759          * Update the AuxCoreBoot0 with boot state for secondary core.
760 @@ -223,7 +223,7 @@
761          * Now the secondary core is starting up let it run its
762          * calibrations, then wait for it to finish
763          */
764 -       spin_unlock(&boot_lock);
765 +       raw_spin_unlock(&boot_lock);
767         return 0;
769 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-prima2/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-prima2/platsmp.c
770 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-prima2/platsmp.c  2017-04-16 10:37:31.000000000 +0200
771 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-prima2/platsmp.c       2017-04-18 17:54:19.000000000 +0200
772 @@ -22,7 +22,7 @@
774  static void __iomem *clk_base;
776 -static DEFINE_SPINLOCK(boot_lock);
777 +static DEFINE_RAW_SPINLOCK(boot_lock);
779  static void sirfsoc_secondary_init(unsigned int cpu)
781 @@ -36,8 +36,8 @@
782         /*
783          * Synchronise with the boot thread.
784          */
785 -       spin_lock(&boot_lock);
786 -       spin_unlock(&boot_lock);
787 +       raw_spin_lock(&boot_lock);
788 +       raw_spin_unlock(&boot_lock);
791  static const struct of_device_id clk_ids[]  = {
792 @@ -75,7 +75,7 @@
793         /* make sure write buffer is drained */
794         mb();
796 -       spin_lock(&boot_lock);
797 +       raw_spin_lock(&boot_lock);
799         /*
800          * The secondary processor is waiting to be released from
801 @@ -107,7 +107,7 @@
802          * now the secondary core is starting up let it run its
803          * calibrations, then wait for it to finish
804          */
805 -       spin_unlock(&boot_lock);
806 +       raw_spin_unlock(&boot_lock);
808         return pen_release != -1 ? -ENOSYS : 0;
810 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-qcom/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-qcom/platsmp.c
811 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-qcom/platsmp.c    2017-04-16 10:37:31.000000000 +0200
812 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-qcom/platsmp.c 2017-04-18 17:54:20.000000000 +0200
813 @@ -46,7 +46,7 @@
815  extern void secondary_startup_arm(void);
817 -static DEFINE_SPINLOCK(boot_lock);
818 +static DEFINE_RAW_SPINLOCK(boot_lock);
820  #ifdef CONFIG_HOTPLUG_CPU
821  static void qcom_cpu_die(unsigned int cpu)
822 @@ -60,8 +60,8 @@
823         /*
824          * Synchronise with the boot thread.
825          */
826 -       spin_lock(&boot_lock);
827 -       spin_unlock(&boot_lock);
828 +       raw_spin_lock(&boot_lock);
829 +       raw_spin_unlock(&boot_lock);
832  static int scss_release_secondary(unsigned int cpu)
833 @@ -284,7 +284,7 @@
834          * set synchronisation state between this boot processor
835          * and the secondary one
836          */
837 -       spin_lock(&boot_lock);
838 +       raw_spin_lock(&boot_lock);
840         /*
841          * Send the secondary CPU a soft interrupt, thereby causing
842 @@ -297,7 +297,7 @@
843          * now the secondary core is starting up let it run its
844          * calibrations, then wait for it to finish
845          */
846 -       spin_unlock(&boot_lock);
847 +       raw_spin_unlock(&boot_lock);
849         return ret;
851 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-spear/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-spear/platsmp.c
852 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-spear/platsmp.c   2017-04-16 10:37:31.000000000 +0200
853 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-spear/platsmp.c        2017-04-18 17:54:20.000000000 +0200
854 @@ -32,7 +32,7 @@
855         sync_cache_w(&pen_release);
858 -static DEFINE_SPINLOCK(boot_lock);
859 +static DEFINE_RAW_SPINLOCK(boot_lock);
861  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
863 @@ -47,8 +47,8 @@
864         /*
865          * Synchronise with the boot thread.
866          */
867 -       spin_lock(&boot_lock);
868 -       spin_unlock(&boot_lock);
869 +       raw_spin_lock(&boot_lock);
870 +       raw_spin_unlock(&boot_lock);
873  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
874 @@ -59,7 +59,7 @@
875          * set synchronisation state between this boot processor
876          * and the secondary one
877          */
878 -       spin_lock(&boot_lock);
879 +       raw_spin_lock(&boot_lock);
881         /*
882          * The secondary processor is waiting to be released from
883 @@ -84,7 +84,7 @@
884          * now the secondary core is starting up let it run its
885          * calibrations, then wait for it to finish
886          */
887 -       spin_unlock(&boot_lock);
888 +       raw_spin_unlock(&boot_lock);
890         return pen_release != -1 ? -ENOSYS : 0;
892 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-sti/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-sti/platsmp.c
893 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mach-sti/platsmp.c     2017-04-16 10:37:31.000000000 +0200
894 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mach-sti/platsmp.c  2017-04-18 17:54:20.000000000 +0200
895 @@ -35,7 +35,7 @@
896         sync_cache_w(&pen_release);
899 -static DEFINE_SPINLOCK(boot_lock);
900 +static DEFINE_RAW_SPINLOCK(boot_lock);
902  static void sti_secondary_init(unsigned int cpu)
904 @@ -48,8 +48,8 @@
905         /*
906          * Synchronise with the boot thread.
907          */
908 -       spin_lock(&boot_lock);
909 -       spin_unlock(&boot_lock);
910 +       raw_spin_lock(&boot_lock);
911 +       raw_spin_unlock(&boot_lock);
914  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
915 @@ -60,7 +60,7 @@
916          * set synchronisation state between this boot processor
917          * and the secondary one
918          */
919 -       spin_lock(&boot_lock);
920 +       raw_spin_lock(&boot_lock);
922         /*
923          * The secondary processor is waiting to be released from
924 @@ -91,7 +91,7 @@
925          * now the secondary core is starting up let it run its
926          * calibrations, then wait for it to finish
927          */
928 -       spin_unlock(&boot_lock);
929 +       raw_spin_unlock(&boot_lock);
931         return pen_release != -1 ? -ENOSYS : 0;
933 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mm/fault.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mm/fault.c
934 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mm/fault.c     2017-04-16 10:37:31.000000000 +0200
935 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mm/fault.c  2017-04-18 17:54:20.000000000 +0200
936 @@ -430,6 +430,9 @@
937         if (addr < TASK_SIZE)
938                 return do_page_fault(addr, fsr, regs);
940 +       if (interrupts_enabled(regs))
941 +               local_irq_enable();
943         if (user_mode(regs))
944                 goto bad_area;
946 @@ -497,6 +500,9 @@
947  static int
948  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
950 +       if (interrupts_enabled(regs))
951 +               local_irq_enable();
953         do_bad_area(addr, fsr, regs);
954         return 0;
956 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mm/highmem.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mm/highmem.c
957 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/mm/highmem.c   2017-04-16 10:37:31.000000000 +0200
958 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/mm/highmem.c        2017-04-18 17:54:20.000000000 +0200
959 @@ -34,6 +34,11 @@
960         return *ptep;
963 +static unsigned int fixmap_idx(int type)
965 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
968  void *kmap(struct page *page)
970         might_sleep();
971 @@ -54,12 +59,13 @@
973  void *kmap_atomic(struct page *page)
975 +       pte_t pte = mk_pte(page, kmap_prot);
976         unsigned int idx;
977         unsigned long vaddr;
978         void *kmap;
979         int type;
981 -       preempt_disable();
982 +       preempt_disable_nort();
983         pagefault_disable();
984         if (!PageHighMem(page))
985                 return page_address(page);
986 @@ -79,7 +85,7 @@
988         type = kmap_atomic_idx_push();
990 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
991 +       idx = fixmap_idx(type);
992         vaddr = __fix_to_virt(idx);
993  #ifdef CONFIG_DEBUG_HIGHMEM
994         /*
995 @@ -93,7 +99,10 @@
996          * in place, so the contained TLB flush ensures the TLB is updated
997          * with the new mapping.
998          */
999 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1000 +#ifdef CONFIG_PREEMPT_RT_FULL
1001 +       current->kmap_pte[type] = pte;
1002 +#endif
1003 +       set_fixmap_pte(idx, pte);
1005         return (void *)vaddr;
1007 @@ -106,44 +115,75 @@
1009         if (kvaddr >= (void *)FIXADDR_START) {
1010                 type = kmap_atomic_idx();
1011 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1012 +               idx = fixmap_idx(type);
1014                 if (cache_is_vivt())
1015                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1016 +#ifdef CONFIG_PREEMPT_RT_FULL
1017 +               current->kmap_pte[type] = __pte(0);
1018 +#endif
1019  #ifdef CONFIG_DEBUG_HIGHMEM
1020                 BUG_ON(vaddr != __fix_to_virt(idx));
1021 -               set_fixmap_pte(idx, __pte(0));
1022  #else
1023                 (void) idx;  /* to kill a warning */
1024  #endif
1025 +               set_fixmap_pte(idx, __pte(0));
1026                 kmap_atomic_idx_pop();
1027         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1028                 /* this address was obtained through kmap_high_get() */
1029                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1030         }
1031         pagefault_enable();
1032 -       preempt_enable();
1033 +       preempt_enable_nort();
1035  EXPORT_SYMBOL(__kunmap_atomic);
1037  void *kmap_atomic_pfn(unsigned long pfn)
1039 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1040         unsigned long vaddr;
1041         int idx, type;
1042         struct page *page = pfn_to_page(pfn);
1044 -       preempt_disable();
1045 +       preempt_disable_nort();
1046         pagefault_disable();
1047         if (!PageHighMem(page))
1048                 return page_address(page);
1050         type = kmap_atomic_idx_push();
1051 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1052 +       idx = fixmap_idx(type);
1053         vaddr = __fix_to_virt(idx);
1054  #ifdef CONFIG_DEBUG_HIGHMEM
1055         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1056  #endif
1057 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1058 +#ifdef CONFIG_PREEMPT_RT_FULL
1059 +       current->kmap_pte[type] = pte;
1060 +#endif
1061 +       set_fixmap_pte(idx, pte);
1063         return (void *)vaddr;
1065 +#if defined CONFIG_PREEMPT_RT_FULL
1066 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1068 +       int i;
1070 +       /*
1071 +        * Clear @prev's kmap_atomic mappings
1072 +        */
1073 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1074 +               int idx = fixmap_idx(i);
1076 +               set_fixmap_pte(idx, __pte(0));
1077 +       }
1078 +       /*
1079 +        * Restore @next_p's kmap_atomic mappings
1080 +        */
1081 +       for (i = 0; i < next_p->kmap_idx; i++) {
1082 +               int idx = fixmap_idx(i);
1084 +               if (!pte_none(next_p->kmap_pte[i]))
1085 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1086 +       }
1088 +#endif
1089 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/plat-versatile/platsmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/plat-versatile/platsmp.c
1090 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm/plat-versatile/platsmp.c       2017-04-16 10:37:31.000000000 +0200
1091 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm/plat-versatile/platsmp.c    2017-04-18 17:54:20.000000000 +0200
1092 @@ -32,7 +32,7 @@
1093         sync_cache_w(&pen_release);
1096 -static DEFINE_SPINLOCK(boot_lock);
1097 +static DEFINE_RAW_SPINLOCK(boot_lock);
1099  void versatile_secondary_init(unsigned int cpu)
1101 @@ -45,8 +45,8 @@
1102         /*
1103          * Synchronise with the boot thread.
1104          */
1105 -       spin_lock(&boot_lock);
1106 -       spin_unlock(&boot_lock);
1107 +       raw_spin_lock(&boot_lock);
1108 +       raw_spin_unlock(&boot_lock);
1111  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1112 @@ -57,7 +57,7 @@
1113          * Set synchronisation state between this boot processor
1114          * and the secondary one
1115          */
1116 -       spin_lock(&boot_lock);
1117 +       raw_spin_lock(&boot_lock);
1119         /*
1120          * This is really belt and braces; we hold unintended secondary
1121 @@ -87,7 +87,7 @@
1122          * now the secondary core is starting up let it run its
1123          * calibrations, then wait for it to finish
1124          */
1125 -       spin_unlock(&boot_lock);
1126 +       raw_spin_unlock(&boot_lock);
1128         return pen_release != -1 ? -ENOSYS : 0;
1130 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/Kconfig
1131 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/Kconfig      2017-04-16 10:37:31.000000000 +0200
1132 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/Kconfig   2017-04-18 17:54:20.000000000 +0200
1133 @@ -91,6 +91,7 @@
1134         select HAVE_PERF_EVENTS
1135         select HAVE_PERF_REGS
1136         select HAVE_PERF_USER_STACK_DUMP
1137 +       select HAVE_PREEMPT_LAZY
1138         select HAVE_REGS_AND_STACK_ACCESS_API
1139         select HAVE_RCU_TABLE_FREE
1140         select HAVE_SYSCALL_TRACEPOINTS
1141 @@ -704,7 +705,7 @@
1143  config XEN
1144         bool "Xen guest support on ARM64"
1145 -       depends on ARM64 && OF
1146 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1147         select SWIOTLB_XEN
1148         select PARAVIRT
1149         help
1150 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/include/asm/thread_info.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/include/asm/thread_info.h
1151 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/include/asm/thread_info.h    2017-04-16 10:37:32.000000000 +0200
1152 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/include/asm/thread_info.h 2017-04-18 17:54:20.000000000 +0200
1153 @@ -49,6 +49,7 @@
1154         mm_segment_t            addr_limit;     /* address limit */
1155         struct task_struct      *task;          /* main task structure */
1156         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1157 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1158         int                     cpu;            /* cpu */
1159  };
1161 @@ -112,6 +113,7 @@
1162  #define TIF_NEED_RESCHED       1
1163  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1164  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1165 +#define TIF_NEED_RESCHED_LAZY  4
1166  #define TIF_NOHZ               7
1167  #define TIF_SYSCALL_TRACE      8
1168  #define TIF_SYSCALL_AUDIT      9
1169 @@ -127,6 +129,7 @@
1170  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1171  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1172  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1173 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1174  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1175  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1176  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1177 @@ -135,7 +138,9 @@
1178  #define _TIF_32BIT             (1 << TIF_32BIT)
1180  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1181 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1182 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1183 +                                _TIF_NEED_RESCHED_LAZY)
1184 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1186  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1187                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1188 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/asm-offsets.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/asm-offsets.c
1189 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/asm-offsets.c 2017-04-16 10:37:32.000000000 +0200
1190 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/asm-offsets.c      2017-04-18 17:54:20.000000000 +0200
1191 @@ -38,6 +38,7 @@
1192    BLANK();
1193    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1194    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1195 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1196    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1197    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1198    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1199 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/entry.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/entry.S
1200 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/entry.S       2017-04-16 10:37:32.000000000 +0200
1201 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/entry.S    2017-04-18 17:54:20.000000000 +0200
1202 @@ -428,11 +428,16 @@
1204  #ifdef CONFIG_PREEMPT
1205         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1206 -       cbnz    w24, 1f                         // preempt count != 0
1207 +       cbnz    w24, 2f                         // preempt count != 0
1208         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1209 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1210 -       bl      el1_preempt
1211 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1213 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1214 +       cbnz    w24, 2f                         // preempt lazy count != 0
1215 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1216  1:
1217 +       bl      el1_preempt
1219  #endif
1220  #ifdef CONFIG_TRACE_IRQFLAGS
1221         bl      trace_hardirqs_on
1222 @@ -446,6 +451,7 @@
1223  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1224         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1225         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1226 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1227         ret     x24
1228  #endif
1230 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/signal.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/signal.c
1231 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/arm64/kernel/signal.c      2017-04-16 10:37:32.000000000 +0200
1232 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/arm64/kernel/signal.c   2017-04-18 17:54:20.000000000 +0200
1233 @@ -409,7 +409,7 @@
1234          */
1235         trace_hardirqs_off();
1236         do {
1237 -               if (thread_flags & _TIF_NEED_RESCHED) {
1238 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1239                         schedule();
1240                 } else {
1241                         local_irq_enable();
1242 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/mips/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/mips/Kconfig
1243 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/mips/Kconfig       2017-04-16 10:37:33.000000000 +0200
1244 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/mips/Kconfig    2017-04-18 17:54:20.000000000 +0200
1245 @@ -2514,7 +2514,7 @@
1247  config HIGHMEM
1248         bool "High Memory Support"
1249 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1250 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1252  config CPU_SUPPORTS_HIGHMEM
1253         bool
1254 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/Kconfig
1255 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/Kconfig    2017-04-16 10:37:34.000000000 +0200
1256 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/Kconfig 2017-04-18 17:54:20.000000000 +0200
1257 @@ -52,10 +52,11 @@
1259  config RWSEM_GENERIC_SPINLOCK
1260         bool
1261 +       default y if PREEMPT_RT_FULL
1263  config RWSEM_XCHGADD_ALGORITHM
1264         bool
1265 -       default y
1266 +       default y if !PREEMPT_RT_FULL
1268  config GENERIC_LOCKBREAK
1269         bool
1270 @@ -134,6 +135,7 @@
1271         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1272         select GENERIC_STRNCPY_FROM_USER
1273         select GENERIC_STRNLEN_USER
1274 +       select HAVE_PREEMPT_LAZY
1275         select HAVE_MOD_ARCH_SPECIFIC
1276         select MODULES_USE_ELF_RELA
1277         select CLONE_BACKWARDS
1278 @@ -321,7 +323,7 @@
1280  config HIGHMEM
1281         bool "High memory support"
1282 -       depends on PPC32
1283 +       depends on PPC32 && !PREEMPT_RT_FULL
1285  source kernel/Kconfig.hz
1286  source kernel/Kconfig.preempt
1287 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/include/asm/thread_info.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/include/asm/thread_info.h
1288 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/include/asm/thread_info.h  2017-04-16 10:37:35.000000000 +0200
1289 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/include/asm/thread_info.h       2017-04-18 17:54:20.000000000 +0200
1290 @@ -43,6 +43,8 @@
1291         int             cpu;                    /* cpu we're on */
1292         int             preempt_count;          /* 0 => preemptable,
1293                                                    <0 => BUG */
1294 +       int             preempt_lazy_count;     /* 0 => preemptable,
1295 +                                                  <0 => BUG */
1296         unsigned long   local_flags;            /* private flags for thread */
1297  #ifdef CONFIG_LIVEPATCH
1298         unsigned long *livepatch_sp;
1299 @@ -88,8 +90,7 @@
1300  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1301  #define TIF_SIGPENDING         1       /* signal pending */
1302  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1303 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1304 -                                          TIF_NEED_RESCHED */
1305 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1306  #define TIF_32BIT              4       /* 32 bit binary */
1307  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1308  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1309 @@ -107,6 +108,8 @@
1310  #if defined(CONFIG_PPC64)
1311  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1312  #endif
1313 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1314 +                                          TIF_NEED_RESCHED */
1316  /* as above, but as bit values */
1317  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1318 @@ -125,14 +128,16 @@
1319  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1320  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1321  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1322 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1323  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1324                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1325                                  _TIF_NOHZ)
1327  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1328                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1329 -                                _TIF_RESTORE_TM)
1330 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1331  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1332 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1334  /* Bits in local_flags */
1335  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1336 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/asm-offsets.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/asm-offsets.c
1337 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/asm-offsets.c       2017-04-16 10:37:35.000000000 +0200
1338 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/asm-offsets.c    2017-04-18 17:54:20.000000000 +0200
1339 @@ -156,6 +156,7 @@
1340         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1341         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1342         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1343 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1344         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1345         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1347 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/entry_32.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/entry_32.S
1348 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/entry_32.S  2017-04-16 10:37:35.000000000 +0200
1349 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/entry_32.S       2017-04-18 17:54:20.000000000 +0200
1350 @@ -835,7 +835,14 @@
1351         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1352         bne     restore
1353         andi.   r8,r8,_TIF_NEED_RESCHED
1354 +       bne+    1f
1355 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1356 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1357 +       bne     restore
1358 +       lwz     r0,TI_FLAGS(r9)
1359 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1360         beq+    restore
1362         lwz     r3,_MSR(r1)
1363         andi.   r0,r3,MSR_EE    /* interrupts off? */
1364         beq     restore         /* don't schedule if so */
1365 @@ -846,11 +853,11 @@
1366          */
1367         bl      trace_hardirqs_off
1368  #endif
1369 -1:     bl      preempt_schedule_irq
1370 +2:     bl      preempt_schedule_irq
1371         CURRENT_THREAD_INFO(r9, r1)
1372         lwz     r3,TI_FLAGS(r9)
1373 -       andi.   r0,r3,_TIF_NEED_RESCHED
1374 -       bne-    1b
1375 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1376 +       bne-    2b
1377  #ifdef CONFIG_TRACE_IRQFLAGS
1378         /* And now, to properly rebalance the above, we tell lockdep they
1379          * are being turned back on, which will happen when we return
1380 @@ -1171,7 +1178,7 @@
1381  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1383  do_work:                       /* r10 contains MSR_KERNEL here */
1384 -       andi.   r0,r9,_TIF_NEED_RESCHED
1385 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1386         beq     do_user_signal
1388  do_resched:                    /* r10 contains MSR_KERNEL here */
1389 @@ -1192,7 +1199,7 @@
1390         MTMSRD(r10)             /* disable interrupts */
1391         CURRENT_THREAD_INFO(r9, r1)
1392         lwz     r9,TI_FLAGS(r9)
1393 -       andi.   r0,r9,_TIF_NEED_RESCHED
1394 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1395         bne-    do_resched
1396         andi.   r0,r9,_TIF_USER_WORK_MASK
1397         beq     restore_user
1398 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/entry_64.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/entry_64.S
1399 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/entry_64.S  2017-04-16 10:37:35.000000000 +0200
1400 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/entry_64.S       2017-04-18 17:54:20.000000000 +0200
1401 @@ -656,7 +656,7 @@
1402         bl      restore_math
1403         b       restore
1404  #endif
1405 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1406 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1407         beq     2f
1408         bl      restore_interrupts
1409         SCHEDULE_USER
1410 @@ -718,10 +718,18 @@
1412  #ifdef CONFIG_PREEMPT
1413         /* Check if we need to preempt */
1414 +       lwz     r8,TI_PREEMPT(r9)
1415 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1416 +       bne     restore
1417         andi.   r0,r4,_TIF_NEED_RESCHED
1418 +       bne+    check_count
1420 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1421         beq+    restore
1422 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1424         /* Check that preempt_count() == 0 and interrupts are enabled */
1425 -       lwz     r8,TI_PREEMPT(r9)
1426 +check_count:
1427         cmpwi   cr1,r8,0
1428         ld      r0,SOFTE(r1)
1429         cmpdi   r0,0
1430 @@ -738,7 +746,7 @@
1431         /* Re-test flags and eventually loop */
1432         CURRENT_THREAD_INFO(r9, r1)
1433         ld      r4,TI_FLAGS(r9)
1434 -       andi.   r0,r4,_TIF_NEED_RESCHED
1435 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1436         bne     1b
1438         /*
1439 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/irq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/irq.c
1440 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/irq.c       2017-04-16 10:37:35.000000000 +0200
1441 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/irq.c    2017-04-18 17:54:20.000000000 +0200
1442 @@ -638,6 +638,7 @@
1443         }
1446 +#ifndef CONFIG_PREEMPT_RT_FULL
1447  void do_softirq_own_stack(void)
1449         struct thread_info *curtp, *irqtp;
1450 @@ -655,6 +656,7 @@
1451         if (irqtp->flags)
1452                 set_bits(irqtp->flags, &curtp->flags);
1454 +#endif
1456  irq_hw_number_t virq_to_hw(unsigned int virq)
1458 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/misc_32.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/misc_32.S
1459 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/misc_32.S   2017-04-16 10:37:35.000000000 +0200
1460 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/misc_32.S        2017-04-18 17:54:20.000000000 +0200
1461 @@ -41,6 +41,7 @@
1462   * We store the saved ksp_limit in the unused part
1463   * of the STACK_FRAME_OVERHEAD
1464   */
1465 +#ifndef CONFIG_PREEMPT_RT_FULL
1466  _GLOBAL(call_do_softirq)
1467         mflr    r0
1468         stw     r0,4(r1)
1469 @@ -57,6 +58,7 @@
1470         stw     r10,THREAD+KSP_LIMIT(r2)
1471         mtlr    r0
1472         blr
1473 +#endif
1475  /*
1476   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1477 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/misc_64.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/misc_64.S
1478 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kernel/misc_64.S   2017-04-16 10:37:35.000000000 +0200
1479 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kernel/misc_64.S        2017-04-18 17:54:20.000000000 +0200
1480 @@ -31,6 +31,7 @@
1482         .text
1484 +#ifndef CONFIG_PREEMPT_RT_FULL
1485  _GLOBAL(call_do_softirq)
1486         mflr    r0
1487         std     r0,16(r1)
1488 @@ -41,6 +42,7 @@
1489         ld      r0,16(r1)
1490         mtlr    r0
1491         blr
1492 +#endif
1494  _GLOBAL(call_do_irq)
1495         mflr    r0
1496 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kvm/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kvm/Kconfig
1497 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/kvm/Kconfig        2017-04-16 10:37:35.000000000 +0200
1498 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/kvm/Kconfig     2017-04-18 17:54:20.000000000 +0200
1499 @@ -175,6 +175,7 @@
1500  config KVM_MPIC
1501         bool "KVM in-kernel MPIC emulation"
1502         depends on KVM && E500
1503 +       depends on !PREEMPT_RT_FULL
1504         select HAVE_KVM_IRQCHIP
1505         select HAVE_KVM_IRQFD
1506         select HAVE_KVM_IRQ_ROUTING
1507 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/platforms/ps3/device-init.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/platforms/ps3/device-init.c
1508 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/powerpc/platforms/ps3/device-init.c        2017-04-16 10:37:35.000000000 +0200
1509 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/powerpc/platforms/ps3/device-init.c     2017-04-18 17:54:20.000000000 +0200
1510 @@ -752,7 +752,7 @@
1511         }
1512         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1514 -       res = wait_event_interruptible(dev->done.wait,
1515 +       res = swait_event_interruptible(dev->done.wait,
1516                                        dev->done.done || kthread_should_stop());
1517         if (kthread_should_stop())
1518                 res = -EINTR;
1519 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sh/kernel/irq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sh/kernel/irq.c
1520 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sh/kernel/irq.c    2017-04-16 10:37:36.000000000 +0200
1521 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sh/kernel/irq.c 2017-04-18 17:54:20.000000000 +0200
1522 @@ -147,6 +147,7 @@
1523         hardirq_ctx[cpu] = NULL;
1526 +#ifndef CONFIG_PREEMPT_RT_FULL
1527  void do_softirq_own_stack(void)
1529         struct thread_info *curctx;
1530 @@ -174,6 +175,7 @@
1531                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1532         );
1534 +#endif
1535  #else
1536  static inline void handle_one_irq(unsigned int irq)
1538 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sparc/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sparc/Kconfig
1539 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sparc/Kconfig      2017-04-16 10:37:36.000000000 +0200
1540 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sparc/Kconfig   2017-04-18 17:54:21.000000000 +0200
1541 @@ -194,12 +194,10 @@
1542  source kernel/Kconfig.hz
1544  config RWSEM_GENERIC_SPINLOCK
1545 -       bool
1546 -       default y if SPARC32
1547 +       def_bool PREEMPT_RT_FULL
1549  config RWSEM_XCHGADD_ALGORITHM
1550 -       bool
1551 -       default y if SPARC64
1552 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1554  config GENERIC_HWEIGHT
1555         bool
1556 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sparc/kernel/irq_64.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sparc/kernel/irq_64.c
1557 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/sparc/kernel/irq_64.c      2017-04-16 10:37:36.000000000 +0200
1558 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/sparc/kernel/irq_64.c   2017-04-18 17:54:21.000000000 +0200
1559 @@ -854,6 +854,7 @@
1560         set_irq_regs(old_regs);
1563 +#ifndef CONFIG_PREEMPT_RT_FULL
1564  void do_softirq_own_stack(void)
1566         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1567 @@ -868,6 +869,7 @@
1568         __asm__ __volatile__("mov %0, %%sp"
1569                              : : "r" (orig_sp));
1571 +#endif
1573  #ifdef CONFIG_HOTPLUG_CPU
1574  void fixup_irqs(void)
1575 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/Kconfig
1576 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/Kconfig        2017-04-16 10:37:37.000000000 +0200
1577 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/Kconfig     2017-04-18 17:54:21.000000000 +0200
1578 @@ -17,6 +17,7 @@
1579  ### Arch settings
1580  config X86
1581         def_bool y
1582 +       select HAVE_PREEMPT_LAZY
1583         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1584         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1585         select ANON_INODES
1586 @@ -232,8 +233,11 @@
1587         def_bool y
1588         depends on ISA_DMA_API
1590 +config RWSEM_GENERIC_SPINLOCK
1591 +       def_bool PREEMPT_RT_FULL
1593  config RWSEM_XCHGADD_ALGORITHM
1594 -       def_bool y
1595 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1597  config GENERIC_CALIBRATE_DELAY
1598         def_bool y
1599 @@ -897,7 +901,7 @@
1600  config MAXSMP
1601         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1602         depends on X86_64 && SMP && DEBUG_KERNEL
1603 -       select CPUMASK_OFFSTACK
1604 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1605         ---help---
1606           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1607           If unsure, say N.
1608 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/aesni-intel_glue.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/aesni-intel_glue.c
1609 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/aesni-intel_glue.c      2017-04-16 10:37:37.000000000 +0200
1610 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/aesni-intel_glue.c   2017-04-18 17:54:21.000000000 +0200
1611 @@ -372,14 +372,14 @@
1612         err = blkcipher_walk_virt(desc, &walk);
1613         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1615 -       kernel_fpu_begin();
1616         while ((nbytes = walk.nbytes)) {
1617 +               kernel_fpu_begin();
1618                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1619 -                             nbytes & AES_BLOCK_MASK);
1620 +                               nbytes & AES_BLOCK_MASK);
1621 +               kernel_fpu_end();
1622                 nbytes &= AES_BLOCK_SIZE - 1;
1623                 err = blkcipher_walk_done(desc, &walk, nbytes);
1624         }
1625 -       kernel_fpu_end();
1627         return err;
1629 @@ -396,14 +396,14 @@
1630         err = blkcipher_walk_virt(desc, &walk);
1631         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1633 -       kernel_fpu_begin();
1634         while ((nbytes = walk.nbytes)) {
1635 +               kernel_fpu_begin();
1636                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1637                               nbytes & AES_BLOCK_MASK);
1638 +               kernel_fpu_end();
1639                 nbytes &= AES_BLOCK_SIZE - 1;
1640                 err = blkcipher_walk_done(desc, &walk, nbytes);
1641         }
1642 -       kernel_fpu_end();
1644         return err;
1646 @@ -420,14 +420,14 @@
1647         err = blkcipher_walk_virt(desc, &walk);
1648         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1650 -       kernel_fpu_begin();
1651         while ((nbytes = walk.nbytes)) {
1652 +               kernel_fpu_begin();
1653                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1654                               nbytes & AES_BLOCK_MASK, walk.iv);
1655 +               kernel_fpu_end();
1656                 nbytes &= AES_BLOCK_SIZE - 1;
1657                 err = blkcipher_walk_done(desc, &walk, nbytes);
1658         }
1659 -       kernel_fpu_end();
1661         return err;
1663 @@ -444,14 +444,14 @@
1664         err = blkcipher_walk_virt(desc, &walk);
1665         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1667 -       kernel_fpu_begin();
1668         while ((nbytes = walk.nbytes)) {
1669 +               kernel_fpu_begin();
1670                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1671                               nbytes & AES_BLOCK_MASK, walk.iv);
1672 +               kernel_fpu_end();
1673                 nbytes &= AES_BLOCK_SIZE - 1;
1674                 err = blkcipher_walk_done(desc, &walk, nbytes);
1675         }
1676 -       kernel_fpu_end();
1678         return err;
1680 @@ -503,18 +503,20 @@
1681         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1682         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1684 -       kernel_fpu_begin();
1685         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1686 +               kernel_fpu_begin();
1687                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1688                                       nbytes & AES_BLOCK_MASK, walk.iv);
1689 +               kernel_fpu_end();
1690                 nbytes &= AES_BLOCK_SIZE - 1;
1691                 err = blkcipher_walk_done(desc, &walk, nbytes);
1692         }
1693         if (walk.nbytes) {
1694 +               kernel_fpu_begin();
1695                 ctr_crypt_final(ctx, &walk);
1696 +               kernel_fpu_end();
1697                 err = blkcipher_walk_done(desc, &walk, 0);
1698         }
1699 -       kernel_fpu_end();
1701         return err;
1703 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/cast5_avx_glue.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/cast5_avx_glue.c
1704 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/cast5_avx_glue.c        2017-04-16 10:37:37.000000000 +0200
1705 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/cast5_avx_glue.c     2017-04-18 17:54:21.000000000 +0200
1706 @@ -59,7 +59,7 @@
1707  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1708                      bool enc)
1710 -       bool fpu_enabled = false;
1711 +       bool fpu_enabled;
1712         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1713         const unsigned int bsize = CAST5_BLOCK_SIZE;
1714         unsigned int nbytes;
1715 @@ -75,7 +75,7 @@
1716                 u8 *wsrc = walk->src.virt.addr;
1717                 u8 *wdst = walk->dst.virt.addr;
1719 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1720 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1722                 /* Process multi-block batch */
1723                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1724 @@ -103,10 +103,9 @@
1725                 } while (nbytes >= bsize);
1727  done:
1728 +               cast5_fpu_end(fpu_enabled);
1729                 err = blkcipher_walk_done(desc, walk, nbytes);
1730         }
1732 -       cast5_fpu_end(fpu_enabled);
1733         return err;
1736 @@ -227,7 +226,7 @@
1737  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1738                        struct scatterlist *src, unsigned int nbytes)
1740 -       bool fpu_enabled = false;
1741 +       bool fpu_enabled;
1742         struct blkcipher_walk walk;
1743         int err;
1745 @@ -236,12 +235,11 @@
1746         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1748         while ((nbytes = walk.nbytes)) {
1749 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1750 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1751                 nbytes = __cbc_decrypt(desc, &walk);
1752 +               cast5_fpu_end(fpu_enabled);
1753                 err = blkcipher_walk_done(desc, &walk, nbytes);
1754         }
1756 -       cast5_fpu_end(fpu_enabled);
1757         return err;
1760 @@ -311,7 +309,7 @@
1761  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1762                      struct scatterlist *src, unsigned int nbytes)
1764 -       bool fpu_enabled = false;
1765 +       bool fpu_enabled;
1766         struct blkcipher_walk walk;
1767         int err;
1769 @@ -320,13 +318,12 @@
1770         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1772         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1773 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1774 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1775                 nbytes = __ctr_crypt(desc, &walk);
1776 +               cast5_fpu_end(fpu_enabled);
1777                 err = blkcipher_walk_done(desc, &walk, nbytes);
1778         }
1780 -       cast5_fpu_end(fpu_enabled);
1782         if (walk.nbytes) {
1783                 ctr_crypt_final(desc, &walk);
1784                 err = blkcipher_walk_done(desc, &walk, 0);
1785 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/glue_helper.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/glue_helper.c
1786 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/crypto/glue_helper.c   2017-04-16 10:37:37.000000000 +0200
1787 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/crypto/glue_helper.c        2017-04-18 17:54:21.000000000 +0200
1788 @@ -39,7 +39,7 @@
1789         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1790         const unsigned int bsize = 128 / 8;
1791         unsigned int nbytes, i, func_bytes;
1792 -       bool fpu_enabled = false;
1793 +       bool fpu_enabled;
1794         int err;
1796         err = blkcipher_walk_virt(desc, walk);
1797 @@ -49,7 +49,7 @@
1798                 u8 *wdst = walk->dst.virt.addr;
1800                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1801 -                                            desc, fpu_enabled, nbytes);
1802 +                                            desc, false, nbytes);
1804                 for (i = 0; i < gctx->num_funcs; i++) {
1805                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1806 @@ -71,10 +71,10 @@
1807                 }
1809  done:
1810 +               glue_fpu_end(fpu_enabled);
1811                 err = blkcipher_walk_done(desc, walk, nbytes);
1812         }
1814 -       glue_fpu_end(fpu_enabled);
1815         return err;
1818 @@ -194,7 +194,7 @@
1819                             struct scatterlist *src, unsigned int nbytes)
1821         const unsigned int bsize = 128 / 8;
1822 -       bool fpu_enabled = false;
1823 +       bool fpu_enabled;
1824         struct blkcipher_walk walk;
1825         int err;
1827 @@ -203,12 +203,12 @@
1829         while ((nbytes = walk.nbytes)) {
1830                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1831 -                                            desc, fpu_enabled, nbytes);
1832 +                                            desc, false, nbytes);
1833                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1834 +               glue_fpu_end(fpu_enabled);
1835                 err = blkcipher_walk_done(desc, &walk, nbytes);
1836         }
1838 -       glue_fpu_end(fpu_enabled);
1839         return err;
1841  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1842 @@ -277,7 +277,7 @@
1843                           struct scatterlist *src, unsigned int nbytes)
1845         const unsigned int bsize = 128 / 8;
1846 -       bool fpu_enabled = false;
1847 +       bool fpu_enabled;
1848         struct blkcipher_walk walk;
1849         int err;
1851 @@ -286,13 +286,12 @@
1853         while ((nbytes = walk.nbytes) >= bsize) {
1854                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1855 -                                            desc, fpu_enabled, nbytes);
1856 +                                            desc, false, nbytes);
1857                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1858 +               glue_fpu_end(fpu_enabled);
1859                 err = blkcipher_walk_done(desc, &walk, nbytes);
1860         }
1862 -       glue_fpu_end(fpu_enabled);
1864         if (walk.nbytes) {
1865                 glue_ctr_crypt_final_128bit(
1866                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1867 @@ -347,7 +346,7 @@
1868                           void *tweak_ctx, void *crypt_ctx)
1870         const unsigned int bsize = 128 / 8;
1871 -       bool fpu_enabled = false;
1872 +       bool fpu_enabled;
1873         struct blkcipher_walk walk;
1874         int err;
1876 @@ -360,21 +359,21 @@
1878         /* set minimum length to bsize, for tweak_fn */
1879         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1880 -                                    desc, fpu_enabled,
1881 +                                    desc, false,
1882                                      nbytes < bsize ? bsize : nbytes);
1884         /* calculate first value of T */
1885         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1886 +       glue_fpu_end(fpu_enabled);
1888         while (nbytes) {
1889 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1890 +                               desc, false, nbytes);
1891                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1893 +               glue_fpu_end(fpu_enabled);
1894                 err = blkcipher_walk_done(desc, &walk, nbytes);
1895                 nbytes = walk.nbytes;
1896         }
1898 -       glue_fpu_end(fpu_enabled);
1900         return err;
1902  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1903 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/common.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/common.c
1904 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/common.c 2017-04-16 10:37:37.000000000 +0200
1905 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/common.c      2017-04-18 17:54:21.000000000 +0200
1906 @@ -129,7 +129,7 @@
1908  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1909         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1910 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1911 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1913  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1915 @@ -145,9 +145,16 @@
1916                 /* We have work to do. */
1917                 local_irq_enable();
1919 -               if (cached_flags & _TIF_NEED_RESCHED)
1920 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
1921                         schedule();
1923 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
1924 +               if (unlikely(current->forced_info.si_signo)) {
1925 +                       struct task_struct *t = current;
1926 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
1927 +                       t->forced_info.si_signo = 0;
1928 +               }
1929 +#endif
1930                 if (cached_flags & _TIF_UPROBE)
1931                         uprobe_notify_resume(regs);
1933 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/entry_32.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/entry_32.S
1934 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/entry_32.S       2017-04-16 10:37:37.000000000 +0200
1935 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/entry_32.S    2017-04-18 17:54:21.000000000 +0200
1936 @@ -308,8 +308,25 @@
1937  ENTRY(resume_kernel)
1938         DISABLE_INTERRUPTS(CLBR_ANY)
1939  need_resched:
1940 +       # preempt count == 0 + NEED_RS set?
1941         cmpl    $0, PER_CPU_VAR(__preempt_count)
1942 +#ifndef CONFIG_PREEMPT_LAZY
1943         jnz     restore_all
1944 +#else
1945 +       jz test_int_off
1947 +       # at least preempt count == 0 ?
1948 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
1949 +       jne restore_all
1951 +       movl    PER_CPU_VAR(current_task), %ebp
1952 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
1953 +       jnz restore_all
1955 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
1956 +       jz restore_all
1957 +test_int_off:
1958 +#endif
1959         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
1960         jz      restore_all
1961         call    preempt_schedule_irq
1962 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/entry_64.S linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/entry_64.S
1963 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/entry/entry_64.S       2017-04-16 10:37:37.000000000 +0200
1964 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/entry/entry_64.S    2017-04-18 17:54:21.000000000 +0200
1965 @@ -546,7 +546,23 @@
1966         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
1967         jnc     1f
1968  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
1969 +#ifndef CONFIG_PREEMPT_LAZY
1970         jnz     1f
1971 +#else
1972 +       jz      do_preempt_schedule_irq
1974 +       # at least preempt count == 0 ?
1975 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
1976 +       jnz     1f
1978 +       movq    PER_CPU_VAR(current_task), %rcx
1979 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
1980 +       jnz     1f
1982 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
1983 +       jnc     1f
1984 +do_preempt_schedule_irq:
1985 +#endif
1986         call    preempt_schedule_irq
1987         jmp     0b
1988  1:
1989 @@ -894,6 +910,7 @@
1990         jmp     2b
1991         .previous
1993 +#ifndef CONFIG_PREEMPT_RT_FULL
1994  /* Call softirq on interrupt stack. Interrupts are off. */
1995  ENTRY(do_softirq_own_stack)
1996         pushq   %rbp
1997 @@ -906,6 +923,7 @@
1998         decl    PER_CPU_VAR(irq_count)
1999         ret
2000  END(do_softirq_own_stack)
2001 +#endif
2003  #ifdef CONFIG_XEN
2004  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2005 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/preempt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/preempt.h
2006 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/preempt.h  2017-04-16 10:37:37.000000000 +0200
2007 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/preempt.h       2017-04-18 17:54:21.000000000 +0200
2008 @@ -79,17 +79,46 @@
2009   * a decrement which hits zero means we have no preempt_count and should
2010   * reschedule.
2011   */
2012 -static __always_inline bool __preempt_count_dec_and_test(void)
2013 +static __always_inline bool ____preempt_count_dec_and_test(void)
2015         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2018 +static __always_inline bool __preempt_count_dec_and_test(void)
2020 +       if (____preempt_count_dec_and_test())
2021 +               return true;
2022 +#ifdef CONFIG_PREEMPT_LAZY
2023 +       if (current_thread_info()->preempt_lazy_count)
2024 +               return false;
2025 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2026 +#else
2027 +       return false;
2028 +#endif
2031  /*
2032   * Returns true when we need to resched and can (barring IRQ state).
2033   */
2034  static __always_inline bool should_resched(int preempt_offset)
2036 +#ifdef CONFIG_PREEMPT_LAZY
2037 +       u32 tmp;
2039 +       tmp = raw_cpu_read_4(__preempt_count);
2040 +       if (tmp == preempt_offset)
2041 +               return true;
2043 +       /* preempt count == 0 ? */
2044 +       tmp &= ~PREEMPT_NEED_RESCHED;
2045 +       if (tmp)
2046 +               return false;
2047 +       if (current_thread_info()->preempt_lazy_count)
2048 +               return false;
2049 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2050 +#else
2051         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2052 +#endif
2055  #ifdef CONFIG_PREEMPT
2056 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/signal.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/signal.h
2057 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/signal.h   2017-04-16 10:37:37.000000000 +0200
2058 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/signal.h        2017-04-18 17:54:21.000000000 +0200
2059 @@ -27,6 +27,19 @@
2060  #define SA_IA32_ABI    0x02000000u
2061  #define SA_X32_ABI     0x01000000u
2064 + * Because some traps use the IST stack, we must keep preemption
2065 + * disabled while calling do_trap(), but do_trap() may call
2066 + * force_sig_info() which will grab the signal spin_locks for the
2067 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2068 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2069 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2070 + * trap.
2071 + */
2072 +#if defined(CONFIG_PREEMPT_RT_FULL)
2073 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2074 +#endif
2076  #ifndef CONFIG_COMPAT
2077  typedef sigset_t compat_sigset_t;
2078  #endif
2079 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/stackprotector.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/stackprotector.h
2080 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/stackprotector.h   2017-04-16 10:37:37.000000000 +0200
2081 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/stackprotector.h        2017-04-18 17:54:21.000000000 +0200
2082 @@ -59,7 +59,7 @@
2083   */
2084  static __always_inline void boot_init_stack_canary(void)
2086 -       u64 canary;
2087 +       u64 uninitialized_var(canary);
2088         u64 tsc;
2090  #ifdef CONFIG_X86_64
2091 @@ -70,8 +70,15 @@
2092          * of randomness. The TSC only matters for very early init,
2093          * there it already has some randomness on most systems. Later
2094          * on during the bootup the random pool has true entropy too.
2095 +        *
2096 +        * For preempt-rt we need to weaken the randomness a bit, as
2097 +        * we can't call into the random generator from atomic context
2098 +        * due to locking constraints. We just leave canary
2099 +        * uninitialized and use the TSC based randomness on top of it.
2100          */
2101 +#ifndef CONFIG_PREEMPT_RT_FULL
2102         get_random_bytes(&canary, sizeof(canary));
2103 +#endif
2104         tsc = rdtsc();
2105         canary += tsc + (tsc << 32UL);
2107 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/thread_info.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/thread_info.h
2108 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/thread_info.h      2017-04-16 10:37:37.000000000 +0200
2109 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/thread_info.h   2017-04-18 17:54:21.000000000 +0200
2110 @@ -54,11 +54,14 @@
2112  struct thread_info {
2113         unsigned long           flags;          /* low level flags */
2114 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2115 +                                                          <0 => BUG */
2116  };
2118  #define INIT_THREAD_INFO(tsk)                  \
2119  {                                              \
2120         .flags          = 0,                    \
2121 +       .preempt_lazy_count = 0,                \
2124  #define init_stack             (init_thread_union.stack)
2125 @@ -67,6 +70,10 @@
2127  #include <asm/asm-offsets.h>
2129 +#define GET_THREAD_INFO(reg) \
2130 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2131 +       _ASM_SUB $(THREAD_SIZE),reg ;
2133  #endif
2135  /*
2136 @@ -85,6 +92,7 @@
2137  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2138  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2139  #define TIF_SECCOMP            8       /* secure computing */
2140 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2141  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2142  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2143  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2144 @@ -108,6 +116,7 @@
2145  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2146  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2147  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2148 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2149  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2150  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2151  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2152 @@ -143,6 +152,8 @@
2153  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2154  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2156 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2158  #define STACK_WARN             (THREAD_SIZE/8)
2160  /*
2161 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/uv/uv_bau.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/uv/uv_bau.h
2162 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/include/asm/uv/uv_bau.h        2017-04-16 10:37:37.000000000 +0200
2163 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/include/asm/uv/uv_bau.h     2017-04-18 17:54:21.000000000 +0200
2164 @@ -624,9 +624,9 @@
2165         cycles_t                send_message;
2166         cycles_t                period_end;
2167         cycles_t                period_time;
2168 -       spinlock_t              uvhub_lock;
2169 -       spinlock_t              queue_lock;
2170 -       spinlock_t              disable_lock;
2171 +       raw_spinlock_t          uvhub_lock;
2172 +       raw_spinlock_t          queue_lock;
2173 +       raw_spinlock_t          disable_lock;
2174         /* tunables */
2175         int                     max_concurr;
2176         int                     max_concurr_const;
2177 @@ -815,15 +815,15 @@
2178   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2179   * on equal.
2180   */
2181 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2182 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2184 -       spin_lock(lock);
2185 +       raw_spin_lock(lock);
2186         if (atomic_read(v) >= u) {
2187 -               spin_unlock(lock);
2188 +               raw_spin_unlock(lock);
2189                 return 0;
2190         }
2191         atomic_inc(v);
2192 -       spin_unlock(lock);
2193 +       raw_spin_unlock(lock);
2194         return 1;
2197 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/acpi/boot.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/acpi/boot.c
2198 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/acpi/boot.c     2017-04-16 10:37:37.000000000 +0200
2199 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/acpi/boot.c  2017-04-18 17:54:21.000000000 +0200
2200 @@ -87,7 +87,9 @@
2201   *             ->ioapic_mutex
2202   *                     ->ioapic_lock
2203   */
2204 +#ifdef CONFIG_X86_IO_APIC
2205  static DEFINE_MUTEX(acpi_ioapic_lock);
2206 +#endif
2208  /* --------------------------------------------------------------------------
2209                                Boot-time Configuration
2210 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/apic/io_apic.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/apic/io_apic.c
2211 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/apic/io_apic.c  2017-04-16 10:37:37.000000000 +0200
2212 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/apic/io_apic.c       2017-04-18 17:54:21.000000000 +0200
2213 @@ -1712,7 +1712,8 @@
2214  static inline bool ioapic_irqd_mask(struct irq_data *data)
2216         /* If we are moving the irq we need to mask it */
2217 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2218 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2219 +                    !irqd_irq_inprogress(data))) {
2220                 mask_ioapic_irq(data);
2221                 return true;
2222         }
2223 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/asm-offsets.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/asm-offsets.c
2224 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/asm-offsets.c   2017-04-16 10:37:37.000000000 +0200
2225 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/asm-offsets.c        2017-04-18 17:54:21.000000000 +0200
2226 @@ -36,6 +36,7 @@
2228         BLANK();
2229         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2230 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2231         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2233         BLANK();
2234 @@ -91,4 +92,5 @@
2236         BLANK();
2237         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2238 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2240 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/cpu/mcheck/mce.c
2241 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/cpu/mcheck/mce.c        2017-04-16 10:37:37.000000000 +0200
2242 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/cpu/mcheck/mce.c     2017-04-18 17:54:21.000000000 +0200
2243 @@ -41,6 +41,8 @@
2244  #include <linux/debugfs.h>
2245  #include <linux/irq_work.h>
2246  #include <linux/export.h>
2247 +#include <linux/jiffies.h>
2248 +#include <linux/swork.h>
2249  #include <linux/jump_label.h>
2251  #include <asm/processor.h>
2252 @@ -1317,7 +1319,7 @@
2253  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2255  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2256 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2257 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2259  static unsigned long mce_adjust_timer_default(unsigned long interval)
2261 @@ -1326,32 +1328,18 @@
2263  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2265 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2266 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2268 -       unsigned long when = jiffies + interval;
2269 -       unsigned long flags;
2271 -       local_irq_save(flags);
2273 -       if (timer_pending(t)) {
2274 -               if (time_before(when, t->expires))
2275 -                       mod_timer(t, when);
2276 -       } else {
2277 -               t->expires = round_jiffies(when);
2278 -               add_timer_on(t, smp_processor_id());
2279 -       }
2281 -       local_irq_restore(flags);
2282 +       if (!interval)
2283 +               return HRTIMER_NORESTART;
2284 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2285 +       return HRTIMER_RESTART;
2288 -static void mce_timer_fn(unsigned long data)
2289 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2291 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2292 -       int cpu = smp_processor_id();
2293         unsigned long iv;
2295 -       WARN_ON(cpu != data);
2297         iv = __this_cpu_read(mce_next_interval);
2299         if (mce_available(this_cpu_ptr(&cpu_info))) {
2300 @@ -1374,7 +1362,7 @@
2302  done:
2303         __this_cpu_write(mce_next_interval, iv);
2304 -       __restart_timer(t, iv);
2305 +       return __restart_timer(timer, iv);
2308  /*
2309 @@ -1382,7 +1370,7 @@
2310   */
2311  void mce_timer_kick(unsigned long interval)
2313 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2314 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2315         unsigned long iv = __this_cpu_read(mce_next_interval);
2317         __restart_timer(t, interval);
2318 @@ -1397,7 +1385,7 @@
2319         int cpu;
2321         for_each_online_cpu(cpu)
2322 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2323 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2326  static void mce_do_trigger(struct work_struct *work)
2327 @@ -1407,6 +1395,56 @@
2329  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2331 +static void __mce_notify_work(struct swork_event *event)
2333 +       /* Not more than two messages every minute */
2334 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2336 +       /* wake processes polling /dev/mcelog */
2337 +       wake_up_interruptible(&mce_chrdev_wait);
2339 +       /*
2340 +        * There is no risk of missing notifications because
2341 +        * work_pending is always cleared before the function is
2342 +        * executed.
2343 +        */
2344 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2345 +               schedule_work(&mce_trigger_work);
2347 +       if (__ratelimit(&ratelimit))
2348 +               pr_info(HW_ERR "Machine check events logged\n");
2351 +#ifdef CONFIG_PREEMPT_RT_FULL
2352 +static bool notify_work_ready __read_mostly;
2353 +static struct swork_event notify_work;
2355 +static int mce_notify_work_init(void)
2357 +       int err;
2359 +       err = swork_get();
2360 +       if (err)
2361 +               return err;
2363 +       INIT_SWORK(&notify_work, __mce_notify_work);
2364 +       notify_work_ready = true;
2365 +       return 0;
2368 +static void mce_notify_work(void)
2370 +       if (notify_work_ready)
2371 +               swork_queue(&notify_work);
2373 +#else
2374 +static void mce_notify_work(void)
2376 +       __mce_notify_work(NULL);
2378 +static inline int mce_notify_work_init(void) { return 0; }
2379 +#endif
2381  /*
2382   * Notify the user(s) about new machine check events.
2383   * Can be called from interrupt context, but not from machine check/NMI
2384 @@ -1414,19 +1452,8 @@
2385   */
2386  int mce_notify_irq(void)
2388 -       /* Not more than two messages every minute */
2389 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2391         if (test_and_clear_bit(0, &mce_need_notify)) {
2392 -               /* wake processes polling /dev/mcelog */
2393 -               wake_up_interruptible(&mce_chrdev_wait);
2395 -               if (mce_helper[0])
2396 -                       schedule_work(&mce_trigger_work);
2398 -               if (__ratelimit(&ratelimit))
2399 -                       pr_info(HW_ERR "Machine check events logged\n");
2401 +               mce_notify_work();
2402                 return 1;
2403         }
2404         return 0;
2405 @@ -1732,7 +1759,7 @@
2406         }
2409 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2410 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2412         unsigned long iv = check_interval * HZ;
2414 @@ -1741,16 +1768,17 @@
2416         per_cpu(mce_next_interval, cpu) = iv;
2418 -       t->expires = round_jiffies(jiffies + iv);
2419 -       add_timer_on(t, cpu);
2420 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2421 +                       0, HRTIMER_MODE_REL_PINNED);
2424  static void __mcheck_cpu_init_timer(void)
2426 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2427 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2428         unsigned int cpu = smp_processor_id();
2430 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2431 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2432 +       t->function = mce_timer_fn;
2433         mce_start_timer(cpu, t);
2436 @@ -2475,6 +2503,8 @@
2437         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2438                 return;
2440 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2442         if (!(action & CPU_TASKS_FROZEN))
2443                 cmci_clear();
2445 @@ -2497,6 +2527,7 @@
2446                 if (b->init)
2447                         wrmsrl(msr_ops.ctl(i), b->ctl);
2448         }
2449 +       __mcheck_cpu_init_timer();
2452  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2453 @@ -2504,7 +2535,6 @@
2454  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2456         unsigned int cpu = (unsigned long)hcpu;
2457 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2459         switch (action & ~CPU_TASKS_FROZEN) {
2460         case CPU_ONLINE:
2461 @@ -2524,11 +2554,9 @@
2462                 break;
2463         case CPU_DOWN_PREPARE:
2464                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2465 -               del_timer_sync(t);
2466                 break;
2467         case CPU_DOWN_FAILED:
2468                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2469 -               mce_start_timer(cpu, t);
2470                 break;
2471         }
2473 @@ -2567,6 +2595,10 @@
2474                 goto err_out;
2475         }
2477 +       err = mce_notify_work_init();
2478 +       if (err)
2479 +               goto err_out;
2481         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2482                 err = -ENOMEM;
2483                 goto err_out;
2484 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/irq_32.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/irq_32.c
2485 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/irq_32.c        2017-04-16 10:37:37.000000000 +0200
2486 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/irq_32.c     2017-04-18 17:54:21.000000000 +0200
2487 @@ -127,6 +127,7 @@
2488                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2491 +#ifndef CONFIG_PREEMPT_RT_FULL
2492  void do_softirq_own_stack(void)
2494         struct irq_stack *irqstk;
2495 @@ -143,6 +144,7 @@
2497         call_on_stack(__do_softirq, isp);
2499 +#endif
2501  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2503 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/process_32.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/process_32.c
2504 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kernel/process_32.c    2017-04-16 10:37:37.000000000 +0200
2505 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kernel/process_32.c 2017-04-18 17:54:21.000000000 +0200
2506 @@ -35,6 +35,7 @@
2507  #include <linux/uaccess.h>
2508  #include <linux/io.h>
2509  #include <linux/kdebug.h>
2510 +#include <linux/highmem.h>
2512  #include <asm/pgtable.h>
2513  #include <asm/ldt.h>
2514 @@ -195,6 +196,35 @@
2516  EXPORT_SYMBOL_GPL(start_thread);
2518 +#ifdef CONFIG_PREEMPT_RT_FULL
2519 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2521 +       int i;
2523 +       /*
2524 +        * Clear @prev's kmap_atomic mappings
2525 +        */
2526 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2527 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2528 +               pte_t *ptep = kmap_pte - idx;
2530 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2531 +       }
2532 +       /*
2533 +        * Restore @next_p's kmap_atomic mappings
2534 +        */
2535 +       for (i = 0; i < next_p->kmap_idx; i++) {
2536 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2538 +               if (!pte_none(next_p->kmap_pte[i]))
2539 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2540 +       }
2542 +#else
2543 +static inline void
2544 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2545 +#endif
2548  /*
2549   *     switch_to(x,y) should switch tasks from x to y.
2550 @@ -271,6 +301,8 @@
2551                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2552                 __switch_to_xtra(prev_p, next_p, tss);
2554 +       switch_kmaps(prev_p, next_p);
2556         /*
2557          * Leave lazy mode, flushing any hypercalls made here.
2558          * This must be done before restoring TLS segments so
2559 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kvm/lapic.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kvm/lapic.c
2560 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kvm/lapic.c    2017-04-16 10:37:37.000000000 +0200
2561 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kvm/lapic.c 2017-04-18 17:54:21.000000000 +0200
2562 @@ -1939,6 +1939,7 @@
2563         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2564                      HRTIMER_MODE_ABS_PINNED);
2565         apic->lapic_timer.timer.function = apic_timer_fn;
2566 +       apic->lapic_timer.timer.irqsafe = 1;
2568         /*
2569          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2570 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kvm/x86.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kvm/x86.c
2571 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/kvm/x86.c      2017-04-16 10:37:37.000000000 +0200
2572 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/kvm/x86.c   2017-04-18 17:54:21.000000000 +0200
2573 @@ -5933,6 +5933,13 @@
2574                 goto out;
2575         }
2577 +#ifdef CONFIG_PREEMPT_RT_FULL
2578 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2579 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2580 +               return -EOPNOTSUPP;
2581 +       }
2582 +#endif
2584         r = kvm_mmu_module_init();
2585         if (r)
2586                 goto out_free_percpu;
2587 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/highmem_32.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/highmem_32.c
2588 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/highmem_32.c        2017-04-16 10:37:37.000000000 +0200
2589 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/highmem_32.c     2017-04-18 17:54:21.000000000 +0200
2590 @@ -32,10 +32,11 @@
2591   */
2592  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2594 +       pte_t pte = mk_pte(page, prot);
2595         unsigned long vaddr;
2596         int idx, type;
2598 -       preempt_disable();
2599 +       preempt_disable_nort();
2600         pagefault_disable();
2602         if (!PageHighMem(page))
2603 @@ -45,7 +46,10 @@
2604         idx = type + KM_TYPE_NR*smp_processor_id();
2605         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2606         BUG_ON(!pte_none(*(kmap_pte-idx)));
2607 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2608 +#ifdef CONFIG_PREEMPT_RT_FULL
2609 +       current->kmap_pte[type] = pte;
2610 +#endif
2611 +       set_pte(kmap_pte-idx, pte);
2612         arch_flush_lazy_mmu_mode();
2614         return (void *)vaddr;
2615 @@ -88,6 +92,9 @@
2616                  * is a bad idea also, in case the page changes cacheability
2617                  * attributes or becomes a protected page in a hypervisor.
2618                  */
2619 +#ifdef CONFIG_PREEMPT_RT_FULL
2620 +               current->kmap_pte[type] = __pte(0);
2621 +#endif
2622                 kpte_clear_flush(kmap_pte-idx, vaddr);
2623                 kmap_atomic_idx_pop();
2624                 arch_flush_lazy_mmu_mode();
2625 @@ -100,7 +107,7 @@
2626  #endif
2628         pagefault_enable();
2629 -       preempt_enable();
2630 +       preempt_enable_nort();
2632  EXPORT_SYMBOL(__kunmap_atomic);
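The *_nort helpers used in kmap_atomic_prot() and __kunmap_atomic() above are provided by another part of the RT patch, not shown in this hunk. Roughly, and as an assumption about that other hunk, they fall back to the ordinary primitives on non-RT kernels and become compiler barriers on RT, where the mapping is kept preemptible and saved per task instead:

/* Sketch only; the real definitions live elsewhere in the RT patch. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_nort()         barrier()
# define preempt_enable_nort()          barrier()
#else
# define preempt_disable_nort()         preempt_disable()
# define preempt_enable_nort()          preempt_enable()
#endif

The local_irq_save_nort()/local_irq_restore_nort() pair used by later hunks (block/bounce.c, drivers/ata/libata-sff.c) follows the same pattern for the interrupt-disabling variants.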
2634 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/iomap_32.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/iomap_32.c
2635 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/iomap_32.c  2017-04-16 10:37:37.000000000 +0200
2636 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/iomap_32.c       2017-04-18 17:54:21.000000000 +0200
2637 @@ -56,6 +56,7 @@
2639  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2641 +       pte_t pte = pfn_pte(pfn, prot);
2642         unsigned long vaddr;
2643         int idx, type;
2645 @@ -65,7 +66,12 @@
2646         type = kmap_atomic_idx_push();
2647         idx = type + KM_TYPE_NR * smp_processor_id();
2648         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2649 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2650 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2652 +#ifdef CONFIG_PREEMPT_RT_FULL
2653 +       current->kmap_pte[type] = pte;
2654 +#endif
2655 +       set_pte(kmap_pte - idx, pte);
2656         arch_flush_lazy_mmu_mode();
2658         return (void *)vaddr;
2659 @@ -113,6 +119,9 @@
2660                  * is a bad idea also, in case the page changes cacheability
2661                  * attributes or becomes a protected page in a hypervisor.
2662                  */
2663 +#ifdef CONFIG_PREEMPT_RT_FULL
2664 +               current->kmap_pte[type] = __pte(0);
2665 +#endif
2666                 kpte_clear_flush(kmap_pte-idx, vaddr);
2667                 kmap_atomic_idx_pop();
2668         }
2669 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/pageattr.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/pageattr.c
2670 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/mm/pageattr.c  2017-04-16 10:37:37.000000000 +0200
2671 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/mm/pageattr.c       2017-04-18 17:54:21.000000000 +0200
2672 @@ -214,7 +214,15 @@
2673                             int in_flags, struct page **pages)
2675         unsigned int i, level;
2676 +#ifdef CONFIG_PREEMPT
2677 +       /*
2678 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2679 +        * regardless of any CPU isolation that may be in effect.
2680 +        */
2681 +       unsigned long do_wbinvd = 0;
2682 +#else
2683         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2684 +#endif
2686         BUG_ON(irqs_disabled());
2688 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/platform/uv/tlb_uv.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/platform/uv/tlb_uv.c
2689 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/platform/uv/tlb_uv.c   2017-04-16 10:37:37.000000000 +0200
2690 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/platform/uv/tlb_uv.c        2017-04-18 17:54:21.000000000 +0200
2691 @@ -748,9 +748,9 @@
2693                 quiesce_local_uvhub(hmaster);
2695 -               spin_lock(&hmaster->queue_lock);
2696 +               raw_spin_lock(&hmaster->queue_lock);
2697                 reset_with_ipi(&bau_desc->distribution, bcp);
2698 -               spin_unlock(&hmaster->queue_lock);
2699 +               raw_spin_unlock(&hmaster->queue_lock);
2701                 end_uvhub_quiesce(hmaster);
2703 @@ -770,9 +770,9 @@
2705                 quiesce_local_uvhub(hmaster);
2707 -               spin_lock(&hmaster->queue_lock);
2708 +               raw_spin_lock(&hmaster->queue_lock);
2709                 reset_with_ipi(&bau_desc->distribution, bcp);
2710 -               spin_unlock(&hmaster->queue_lock);
2711 +               raw_spin_unlock(&hmaster->queue_lock);
2713                 end_uvhub_quiesce(hmaster);
2715 @@ -793,7 +793,7 @@
2716         cycles_t tm1;
2718         hmaster = bcp->uvhub_master;
2719 -       spin_lock(&hmaster->disable_lock);
2720 +       raw_spin_lock(&hmaster->disable_lock);
2721         if (!bcp->baudisabled) {
2722                 stat->s_bau_disabled++;
2723                 tm1 = get_cycles();
2724 @@ -806,7 +806,7 @@
2725                         }
2726                 }
2727         }
2728 -       spin_unlock(&hmaster->disable_lock);
2729 +       raw_spin_unlock(&hmaster->disable_lock);
2732  static void count_max_concurr(int stat, struct bau_control *bcp,
2733 @@ -869,7 +869,7 @@
2734   */
2735  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2737 -       spinlock_t *lock = &hmaster->uvhub_lock;
2738 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2739         atomic_t *v;
2741         v = &hmaster->active_descriptor_count;
2742 @@ -1002,7 +1002,7 @@
2743         struct bau_control *hmaster;
2745         hmaster = bcp->uvhub_master;
2746 -       spin_lock(&hmaster->disable_lock);
2747 +       raw_spin_lock(&hmaster->disable_lock);
2748         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2749                 stat->s_bau_reenabled++;
2750                 for_each_present_cpu(tcpu) {
2751 @@ -1014,10 +1014,10 @@
2752                                 tbcp->period_giveups = 0;
2753                         }
2754                 }
2755 -               spin_unlock(&hmaster->disable_lock);
2756 +               raw_spin_unlock(&hmaster->disable_lock);
2757                 return 0;
2758         }
2759 -       spin_unlock(&hmaster->disable_lock);
2760 +       raw_spin_unlock(&hmaster->disable_lock);
2761         return -1;
2764 @@ -1940,9 +1940,9 @@
2765                 bcp->cong_reps                  = congested_reps;
2766                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2767                 bcp->giveup_limit               = giveup_limit;
2768 -               spin_lock_init(&bcp->queue_lock);
2769 -               spin_lock_init(&bcp->uvhub_lock);
2770 -               spin_lock_init(&bcp->disable_lock);
2771 +               raw_spin_lock_init(&bcp->queue_lock);
2772 +               raw_spin_lock_init(&bcp->uvhub_lock);
2773 +               raw_spin_lock_init(&bcp->disable_lock);
2774         }
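On PREEMPT_RT, spinlock_t becomes a sleeping lock, which cannot be taken from the interrupt-disabled BAU paths above; converting the queue, disable and uvhub locks to raw_spinlock_t keeps them as true spinning locks. A minimal sketch of the raw variant (illustrative, not taken from the patch):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(hw_lock);   /* still spins on PREEMPT_RT */

static void poke_hw(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&hw_lock, flags);
        /* short, bounded, non-sleeping hardware access */
        raw_spin_unlock_irqrestore(&hw_lock, flags);
}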
2777 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/platform/uv/uv_time.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/platform/uv/uv_time.c
2778 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/arch/x86/platform/uv/uv_time.c  2017-04-16 10:37:37.000000000 +0200
2779 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/arch/x86/platform/uv/uv_time.c       2017-04-18 17:54:21.000000000 +0200
2780 @@ -57,7 +57,7 @@
2782  /* There is one of these allocated per node */
2783  struct uv_rtc_timer_head {
2784 -       spinlock_t      lock;
2785 +       raw_spinlock_t  lock;
2786         /* next cpu waiting for timer, local node relative: */
2787         int             next_cpu;
2788         /* number of cpus on this node: */
2789 @@ -177,7 +177,7 @@
2790                                 uv_rtc_deallocate_timers();
2791                                 return -ENOMEM;
2792                         }
2793 -                       spin_lock_init(&head->lock);
2794 +                       raw_spin_lock_init(&head->lock);
2795                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2796                         head->next_cpu = -1;
2797                         blade_info[bid] = head;
2798 @@ -231,7 +231,7 @@
2799         unsigned long flags;
2800         int next_cpu;
2802 -       spin_lock_irqsave(&head->lock, flags);
2803 +       raw_spin_lock_irqsave(&head->lock, flags);
2805         next_cpu = head->next_cpu;
2806         *t = expires;
2807 @@ -243,12 +243,12 @@
2808                 if (uv_setup_intr(cpu, expires)) {
2809                         *t = ULLONG_MAX;
2810                         uv_rtc_find_next_timer(head, pnode);
2811 -                       spin_unlock_irqrestore(&head->lock, flags);
2812 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2813                         return -ETIME;
2814                 }
2815         }
2817 -       spin_unlock_irqrestore(&head->lock, flags);
2818 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2819         return 0;
2822 @@ -267,7 +267,7 @@
2823         unsigned long flags;
2824         int rc = 0;
2826 -       spin_lock_irqsave(&head->lock, flags);
2827 +       raw_spin_lock_irqsave(&head->lock, flags);
2829         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2830                 rc = 1;
2831 @@ -279,7 +279,7 @@
2832                         uv_rtc_find_next_timer(head, pnode);
2833         }
2835 -       spin_unlock_irqrestore(&head->lock, flags);
2836 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2838         return rc;
2840 @@ -299,13 +299,18 @@
2841  static cycle_t uv_read_rtc(struct clocksource *cs)
2843         unsigned long offset;
2844 +       cycle_t cycles;
2846 +       preempt_disable();
2847         if (uv_get_min_hub_revision_id() == 1)
2848                 offset = 0;
2849         else
2850                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2852 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2853 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2854 +       preempt_enable();
2856 +       return cycles;
2859  /*
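The uv_read_rtc() change above pins the task to a CPU only for as long as it takes to compute the per-CPU MMR offset and perform the read, so the two steps cannot be split by a migration. The same pattern in isolation (illustrative sketch, hypothetical helper name):

#include <linux/smp.h>
#include <linux/preempt.h>

static u64 read_cpu_local_value(u64 (*readfn)(int cpu))
{
        int cpu;
        u64 val;

        cpu = get_cpu();        /* disable preemption, pin to this CPU */
        val = readfn(cpu);
        put_cpu();

        return val;
}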
2860 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-core.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-core.c
2861 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-core.c        2017-04-16 10:37:38.000000000 +0200
2862 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-core.c     2017-04-18 17:54:21.000000000 +0200
2863 @@ -125,6 +125,9 @@
2865         INIT_LIST_HEAD(&rq->queuelist);
2866         INIT_LIST_HEAD(&rq->timeout_list);
2867 +#ifdef CONFIG_PREEMPT_RT_FULL
2868 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2869 +#endif
2870         rq->cpu = -1;
2871         rq->q = q;
2872         rq->__sector = (sector_t) -1;
2873 @@ -233,7 +236,7 @@
2874   **/
2875  void blk_start_queue(struct request_queue *q)
2877 -       WARN_ON(!irqs_disabled());
2878 +       WARN_ON_NONRT(!irqs_disabled());
2880         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2881         __blk_run_queue(q);
2882 @@ -659,7 +662,7 @@
2883                 if (nowait)
2884                         return -EBUSY;
2886 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2887 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2888                                 !atomic_read(&q->mq_freeze_depth) ||
2889                                 blk_queue_dying(q));
2890                 if (blk_queue_dying(q))
2891 @@ -679,7 +682,7 @@
2892         struct request_queue *q =
2893                 container_of(ref, struct request_queue, q_usage_counter);
2895 -       wake_up_all(&q->mq_freeze_wq);
2896 +       swake_up_all(&q->mq_freeze_wq);
2899  static void blk_rq_timed_out_timer(unsigned long data)
2900 @@ -748,7 +751,7 @@
2901         q->bypass_depth = 1;
2902         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2904 -       init_waitqueue_head(&q->mq_freeze_wq);
2905 +       init_swait_queue_head(&q->mq_freeze_wq);
2907         /*
2908          * Init percpu_ref in atomic mode so that it's faster to shutdown.
2909 @@ -3200,7 +3203,7 @@
2910                 blk_run_queue_async(q);
2911         else
2912                 __blk_run_queue(q);
2913 -       spin_unlock(q->queue_lock);
2914 +       spin_unlock_irq(q->queue_lock);
2917  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2918 @@ -3248,7 +3251,6 @@
2919  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2921         struct request_queue *q;
2922 -       unsigned long flags;
2923         struct request *rq;
2924         LIST_HEAD(list);
2925         unsigned int depth;
2926 @@ -3268,11 +3270,6 @@
2927         q = NULL;
2928         depth = 0;
2930 -       /*
2931 -        * Save and disable interrupts here, to avoid doing it for every
2932 -        * queue lock we have to take.
2933 -        */
2934 -       local_irq_save(flags);
2935         while (!list_empty(&list)) {
2936                 rq = list_entry_rq(list.next);
2937                 list_del_init(&rq->queuelist);
2938 @@ -3285,7 +3282,7 @@
2939                                 queue_unplugged(q, depth, from_schedule);
2940                         q = rq->q;
2941                         depth = 0;
2942 -                       spin_lock(q->queue_lock);
2943 +                       spin_lock_irq(q->queue_lock);
2944                 }
2946                 /*
2947 @@ -3312,8 +3309,6 @@
2948          */
2949         if (q)
2950                 queue_unplugged(q, depth, from_schedule);
2952 -       local_irq_restore(flags);
2955  void blk_finish_plug(struct blk_plug *plug)
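mq_freeze_wq above is converted from a regular waitqueue to a simple waitqueue (swait); simple waitqueues use a raw spinlock internally, so they can be waited on and woken from the contexts this part of the RT patch touches. A minimal usage sketch with hypothetical names:

#include <linux/swait.h>
#include <linux/atomic.h>

static DECLARE_SWAIT_QUEUE_HEAD(freeze_wq);
static atomic_t freeze_depth = ATOMIC_INIT(1);

static void wait_until_unfrozen(void)
{
        /* sleep until freeze_depth reaches zero */
        swait_event(freeze_wq, atomic_read(&freeze_depth) == 0);
}

static void unfreeze(void)
{
        if (atomic_dec_and_test(&freeze_depth))
                swake_up_all(&freeze_wq);
}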
2956 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-ioc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-ioc.c
2957 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-ioc.c 2017-04-16 10:37:38.000000000 +0200
2958 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-ioc.c      2017-04-18 17:54:21.000000000 +0200
2959 @@ -7,6 +7,7 @@
2960  #include <linux/bio.h>
2961  #include <linux/blkdev.h>
2962  #include <linux/slab.h>
2963 +#include <linux/delay.h>
2965  #include "blk.h"
2967 @@ -109,7 +110,7 @@
2968                         spin_unlock(q->queue_lock);
2969                 } else {
2970                         spin_unlock_irqrestore(&ioc->lock, flags);
2971 -                       cpu_relax();
2972 +                       cpu_chill();
2973                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
2974                 }
2975         }
2976 @@ -187,7 +188,7 @@
2977                         spin_unlock(icq->q->queue_lock);
2978                 } else {
2979                         spin_unlock_irqrestore(&ioc->lock, flags);
2980 -                       cpu_relax();
2981 +                       cpu_chill();
2982                         goto retry;
2983                 }
2984         }
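The retry loops above previously busy-waited with cpu_relax(); on RT the lock holder may itself be a preempted task, so pure spinning can livelock. cpu_chill() sleeps briefly instead, giving the holder a chance to run. Roughly how the RT patch provides it elsewhere (an assumption, not shown in this hunk):

/* Sketch only. */
#ifdef CONFIG_PREEMPT_RT_FULL
extern void cpu_chill(void);    /* short hrtimer-based sleep */
#else
# define cpu_chill()    cpu_relax()
#endif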
2985 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-mq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-mq.c
2986 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-mq.c  2017-04-16 10:37:38.000000000 +0200
2987 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-mq.c       2017-04-18 17:54:21.000000000 +0200
2988 @@ -72,7 +72,7 @@
2990  static void blk_mq_freeze_queue_wait(struct request_queue *q)
2992 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2993 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2996  /*
2997 @@ -110,7 +110,7 @@
2998         WARN_ON_ONCE(freeze_depth < 0);
2999         if (!freeze_depth) {
3000                 percpu_ref_reinit(&q->q_usage_counter);
3001 -               wake_up_all(&q->mq_freeze_wq);
3002 +               swake_up_all(&q->mq_freeze_wq);
3003         }
3005  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3006 @@ -129,7 +129,7 @@
3007          * dying, we need to ensure that processes currently waiting on
3008          * the queue are notified as well.
3009          */
3010 -       wake_up_all(&q->mq_freeze_wq);
3011 +       swake_up_all(&q->mq_freeze_wq);
3014  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3015 @@ -177,6 +177,9 @@
3016         rq->resid_len = 0;
3017         rq->sense = NULL;
3019 +#ifdef CONFIG_PREEMPT_RT_FULL
3020 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3021 +#endif
3022         INIT_LIST_HEAD(&rq->timeout_list);
3023         rq->timeout = 0;
3025 @@ -345,6 +348,17 @@
3027  EXPORT_SYMBOL(blk_mq_end_request);
3029 +#ifdef CONFIG_PREEMPT_RT_FULL
3030 +
3031 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3032 +{
3033 +       struct request *rq = container_of(work, struct request, work);
3034 +
3035 +       rq->q->softirq_done_fn(rq);
3036 +}
3037 +
3038 +#else
3040  static void __blk_mq_complete_request_remote(void *data)
3042         struct request *rq = data;
3043 @@ -352,6 +366,8 @@
3044         rq->q->softirq_done_fn(rq);
3047 +#endif
3049  static void blk_mq_ipi_complete_request(struct request *rq)
3051         struct blk_mq_ctx *ctx = rq->mq_ctx;
3052 @@ -363,19 +379,23 @@
3053                 return;
3054         }
3056 -       cpu = get_cpu();
3057 +       cpu = get_cpu_light();
3058         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3059                 shared = cpus_share_cache(cpu, ctx->cpu);
3061         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3062 +#ifdef CONFIG_PREEMPT_RT_FULL
3063 +               schedule_work_on(ctx->cpu, &rq->work);
3064 +#else
3065                 rq->csd.func = __blk_mq_complete_request_remote;
3066                 rq->csd.info = rq;
3067                 rq->csd.flags = 0;
3068                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3069 +#endif
3070         } else {
3071                 rq->q->softirq_done_fn(rq);
3072         }
3073 -       put_cpu();
3074 +       put_cpu_light();
3077  static void __blk_mq_complete_request(struct request *rq)
3078 @@ -906,14 +926,14 @@
3079                 return;
3081         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3082 -               int cpu = get_cpu();
3083 +               int cpu = get_cpu_light();
3084                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3085                         __blk_mq_run_hw_queue(hctx);
3086 -                       put_cpu();
3087 +                       put_cpu_light();
3088                         return;
3089                 }
3091 -               put_cpu();
3092 +               put_cpu_light();
3093         }
3095         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
3096 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-mq.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-mq.h
3097 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-mq.h  2017-04-16 10:37:38.000000000 +0200
3098 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-mq.h       2017-04-18 17:54:21.000000000 +0200
3099 @@ -72,12 +72,12 @@
3100   */
3101  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3103 -       return __blk_mq_get_ctx(q, get_cpu());
3104 +       return __blk_mq_get_ctx(q, get_cpu_light());
3107  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3109 -       put_cpu();
3110 +       put_cpu_light();
3113  struct blk_mq_alloc_data {
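get_cpu_light()/put_cpu_light(), used in the blk-mq.c and blk-mq.h hunks above, come from the RT patch's smp.h changes. The intent is to keep the CPU number stable by disabling only migration on RT, so the section stays preemptible; roughly, as an assumption about that other hunk:

/* Sketch only; the real definitions live elsewhere in the RT patch. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()        migrate_enable()
#else
# define get_cpu_light()        get_cpu()
# define put_cpu_light()        put_cpu()
#endif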
3114 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-softirq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-softirq.c
3115 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/blk-softirq.c     2017-04-16 10:37:38.000000000 +0200
3116 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/blk-softirq.c  2017-04-18 17:54:21.000000000 +0200
3117 @@ -51,6 +51,7 @@
3118                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3120         local_irq_restore(flags);
3121 +       preempt_check_resched_rt();
3124  /*
3125 @@ -89,6 +90,7 @@
3126                          this_cpu_ptr(&blk_cpu_done));
3127         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3128         local_irq_enable();
3129 +       preempt_check_resched_rt();
3131         return 0;
3133 @@ -141,6 +143,7 @@
3134                 goto do_local;
3136         local_irq_restore(flags);
3137 +       preempt_check_resched_rt();
3140  /**
3141 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/bounce.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/bounce.c
3142 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/block/bounce.c  2017-04-16 10:37:38.000000000 +0200
3143 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/block/bounce.c       2017-04-18 17:54:21.000000000 +0200
3144 @@ -55,11 +55,11 @@
3145         unsigned long flags;
3146         unsigned char *vto;
3148 -       local_irq_save(flags);
3149 +       local_irq_save_nort(flags);
3150         vto = kmap_atomic(to->bv_page);
3151         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3152         kunmap_atomic(vto);
3153 -       local_irq_restore(flags);
3154 +       local_irq_restore_nort(flags);
3157  #else /* CONFIG_HIGHMEM */
3158 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/algapi.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/algapi.c
3159 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/algapi.c 2017-04-16 10:37:38.000000000 +0200
3160 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/algapi.c      2017-04-18 17:54:21.000000000 +0200
3161 @@ -719,13 +719,13 @@
3163  int crypto_register_notifier(struct notifier_block *nb)
3165 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3166 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3168  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3170  int crypto_unregister_notifier(struct notifier_block *nb)
3172 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3173 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3175  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3177 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/api.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/api.c
3178 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/api.c    2017-04-16 10:37:38.000000000 +0200
3179 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/api.c 2017-04-18 17:54:21.000000000 +0200
3180 @@ -31,7 +31,7 @@
3181  DECLARE_RWSEM(crypto_alg_sem);
3182  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3184 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3185 +SRCU_NOTIFIER_HEAD(crypto_chain);
3186  EXPORT_SYMBOL_GPL(crypto_chain);
3188  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3189 @@ -236,10 +236,10 @@
3191         int ok;
3193 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3194 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3195         if (ok == NOTIFY_DONE) {
3196                 request_module("cryptomgr");
3197 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3198 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3199         }
3201         return ok;
3202 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/internal.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/internal.h
3203 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/crypto/internal.h       2017-04-16 10:37:38.000000000 +0200
3204 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/crypto/internal.h    2017-04-18 17:54:21.000000000 +0200
3205 @@ -47,7 +47,7 @@
3207  extern struct list_head crypto_alg_list;
3208  extern struct rw_semaphore crypto_alg_sem;
3209 -extern struct blocking_notifier_head crypto_chain;
3210 +extern struct srcu_notifier_head crypto_chain;
3212  #ifdef CONFIG_PROC_FS
3213  void __init crypto_init_proc(void);
3214 @@ -146,7 +146,7 @@
3216  static inline void crypto_notify(unsigned long val, void *v)
3218 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3219 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3222  #endif /* _CRYPTO_INTERNAL_H */
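The crypto notifier chain is switched from a blocking notifier, which serializes readers with an rw_semaphore, to an SRCU notifier, which protects the callback walk with SRCU instead. Registration, unregistration and notification keep the same shape; a self-contained usage sketch with hypothetical names:

#include <linux/notifier.h>

static struct srcu_notifier_head demo_chain;    /* hypothetical chain */

static int demo_cb(struct notifier_block *nb, unsigned long event, void *data)
{
        return NOTIFY_OK;
}

static struct notifier_block demo_nb = { .notifier_call = demo_cb };

static void demo_srcu_notifier(void)
{
        srcu_init_notifier_head(&demo_chain);
        srcu_notifier_chain_register(&demo_chain, &demo_nb);
        srcu_notifier_call_chain(&demo_chain, 1, NULL);   /* callbacks run under SRCU */
        srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
}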
3223 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/acglobal.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/acglobal.h
3224 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/acglobal.h  2017-04-16 10:37:38.000000000 +0200
3225 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/acglobal.h       2017-04-18 17:54:22.000000000 +0200
3226 @@ -116,7 +116,7 @@
3227   * interrupt level
3228   */
3229  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3230 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3231 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3232  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3234  /* Mutex for _OSI support */
3235 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/hwregs.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/hwregs.c
3236 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/hwregs.c    2017-04-16 10:37:38.000000000 +0200
3237 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/hwregs.c 2017-04-18 17:54:22.000000000 +0200
3238 @@ -363,14 +363,14 @@
3239                           ACPI_BITMASK_ALL_FIXED_STATUS,
3240                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3242 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3243 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3245         /* Clear the fixed events in PM1 A/B */
3247         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3248                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3250 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3251 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3253         if (ACPI_FAILURE(status)) {
3254                 goto exit;
3255 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/hwxface.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/hwxface.c
3256 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/hwxface.c   2017-04-16 10:37:38.000000000 +0200
3257 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/hwxface.c        2017-04-18 17:54:22.000000000 +0200
3258 @@ -373,7 +373,7 @@
3259                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3260         }
3262 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3263 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3265         /*
3266          * At this point, we know that the parent register is one of the
3267 @@ -434,7 +434,7 @@
3269  unlock_and_exit:
3271 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3272 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3273         return_ACPI_STATUS(status);
3276 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/utmutex.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/utmutex.c
3277 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/acpi/acpica/utmutex.c   2017-04-16 10:37:39.000000000 +0200
3278 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/acpi/acpica/utmutex.c        2017-04-18 17:54:22.000000000 +0200
3279 @@ -88,7 +88,7 @@
3280                 return_ACPI_STATUS (status);
3281         }
3283 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3284 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3285         if (ACPI_FAILURE (status)) {
3286                 return_ACPI_STATUS (status);
3287         }
3288 @@ -145,7 +145,7 @@
3289         /* Delete the spinlocks */
3291         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3292 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3293 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3294         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3296         /* Delete the reader/writer lock */
3297 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ata/libata-sff.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ata/libata-sff.c
3298 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ata/libata-sff.c        2017-04-16 10:37:39.000000000 +0200
3299 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ata/libata-sff.c     2017-04-18 17:54:22.000000000 +0200
3300 @@ -678,9 +678,9 @@
3301         unsigned long flags;
3302         unsigned int consumed;
3304 -       local_irq_save(flags);
3305 +       local_irq_save_nort(flags);
3306         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3307 -       local_irq_restore(flags);
3308 +       local_irq_restore_nort(flags);
3310         return consumed;
3312 @@ -719,7 +719,7 @@
3313                 unsigned long flags;
3315                 /* FIXME: use a bounce buffer */
3316 -               local_irq_save(flags);
3317 +               local_irq_save_nort(flags);
3318                 buf = kmap_atomic(page);
3320                 /* do the actual data transfer */
3321 @@ -727,7 +727,7 @@
3322                                        do_write);
3324                 kunmap_atomic(buf);
3325 -               local_irq_restore(flags);
3326 +               local_irq_restore_nort(flags);
3327         } else {
3328                 buf = page_address(page);
3329                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3330 @@ -864,7 +864,7 @@
3331                 unsigned long flags;
3333                 /* FIXME: use bounce buffer */
3334 -               local_irq_save(flags);
3335 +               local_irq_save_nort(flags);
3336                 buf = kmap_atomic(page);
3338                 /* do the actual data transfer */
3339 @@ -872,7 +872,7 @@
3340                                                                 count, rw);
3342                 kunmap_atomic(buf);
3343 -               local_irq_restore(flags);
3344 +               local_irq_restore_nort(flags);
3345         } else {
3346                 buf = page_address(page);
3347                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3348 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zcomp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zcomp.c
3349 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zcomp.c      2017-04-16 10:37:39.000000000 +0200
3350 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zcomp.c   2017-04-18 17:54:22.000000000 +0200
3351 @@ -118,12 +118,19 @@
3353  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3355 -       return *get_cpu_ptr(comp->stream);
3356 +       struct zcomp_strm *zstrm;
3358 +       zstrm = *this_cpu_ptr(comp->stream);
3359 +       spin_lock(&zstrm->zcomp_lock);
3360 +       return zstrm;
3363  void zcomp_stream_put(struct zcomp *comp)
3365 -       put_cpu_ptr(comp->stream);
3366 +       struct zcomp_strm *zstrm;
3368 +       zstrm = *this_cpu_ptr(comp->stream);
3369 +       spin_unlock(&zstrm->zcomp_lock);
3372  int zcomp_compress(struct zcomp_strm *zstrm,
3373 @@ -174,6 +181,7 @@
3374                         pr_err("Can't allocate a compression stream\n");
3375                         return NOTIFY_BAD;
3376                 }
3377 +               spin_lock_init(&zstrm->zcomp_lock);
3378                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3379                 break;
3380         case CPU_DEAD:
3381 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zcomp.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zcomp.h
3382 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zcomp.h      2017-04-16 10:37:39.000000000 +0200
3383 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zcomp.h   2017-04-18 17:54:22.000000000 +0200
3384 @@ -14,6 +14,7 @@
3385         /* compression/decompression buffer */
3386         void *buffer;
3387         struct crypto_comp *tfm;
3388 +       spinlock_t zcomp_lock;
3389  };
3391  /* dynamic per-device compression frontend */
3392 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zram_drv.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zram_drv.c
3393 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zram_drv.c   2017-04-16 10:37:39.000000000 +0200
3394 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zram_drv.c        2017-04-18 17:54:22.000000000 +0200
3395 @@ -528,6 +528,8 @@
3396                 goto out_error;
3397         }
3399 +       zram_meta_init_table_locks(meta, disksize);
3401         return meta;
3403  out_error:
3404 @@ -575,28 +577,28 @@
3405         struct zram_meta *meta = zram->meta;
3406         unsigned long handle;
3407         unsigned int size;
3408 +       struct zcomp_strm *zstrm;
3410 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3411 +       zram_lock_table(&meta->table[index]);
3412         handle = meta->table[index].handle;
3413         size = zram_get_obj_size(meta, index);
3415         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3416 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3417 +               zram_unlock_table(&meta->table[index]);
3418                 clear_page(mem);
3419                 return 0;
3420         }
3422 +       zstrm = zcomp_stream_get(zram->comp);
3423         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3424         if (size == PAGE_SIZE) {
3425                 copy_page(mem, cmem);
3426         } else {
3427 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3429                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3430 -               zcomp_stream_put(zram->comp);
3431         }
3432         zs_unmap_object(meta->mem_pool, handle);
3433 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3434 +       zcomp_stream_put(zram->comp);
3435 +       zram_unlock_table(&meta->table[index]);
3437         /* Should NEVER happen. Return bio error if it does. */
3438         if (unlikely(ret)) {
3439 @@ -616,14 +618,14 @@
3440         struct zram_meta *meta = zram->meta;
3441         page = bvec->bv_page;
3443 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3444 +       zram_lock_table(&meta->table[index]);
3445         if (unlikely(!meta->table[index].handle) ||
3446                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3447 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3448 +               zram_unlock_table(&meta->table[index]);
3449                 handle_zero_page(bvec);
3450                 return 0;
3451         }
3452 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3453 +       zram_unlock_table(&meta->table[index]);
3455         if (is_partial_io(bvec))
3456                 /* Use  a temporary buffer to decompress the page */
3457 @@ -700,10 +702,10 @@
3458                 if (user_mem)
3459                         kunmap_atomic(user_mem);
3460                 /* Free memory associated with this sector now. */
3461 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3462 +               zram_lock_table(&meta->table[index]);
3463                 zram_free_page(zram, index);
3464                 zram_set_flag(meta, index, ZRAM_ZERO);
3465 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3466 +               zram_unlock_table(&meta->table[index]);
3468                 atomic64_inc(&zram->stats.zero_pages);
3469                 ret = 0;
3470 @@ -794,12 +796,12 @@
3471          * Free memory associated with this sector
3472          * before overwriting unused sectors.
3473          */
3474 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3475 +       zram_lock_table(&meta->table[index]);
3476         zram_free_page(zram, index);
3478         meta->table[index].handle = handle;
3479         zram_set_obj_size(meta, index, clen);
3480 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3481 +       zram_unlock_table(&meta->table[index]);
3483         /* Update stats */
3484         atomic64_add(clen, &zram->stats.compr_data_size);
3485 @@ -842,9 +844,9 @@
3486         }
3488         while (n >= PAGE_SIZE) {
3489 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3490 +               zram_lock_table(&meta->table[index]);
3491                 zram_free_page(zram, index);
3492 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3493 +               zram_unlock_table(&meta->table[index]);
3494                 atomic64_inc(&zram->stats.notify_free);
3495                 index++;
3496                 n -= PAGE_SIZE;
3497 @@ -973,9 +975,9 @@
3498         zram = bdev->bd_disk->private_data;
3499         meta = zram->meta;
3501 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3502 +       zram_lock_table(&meta->table[index]);
3503         zram_free_page(zram, index);
3504 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3505 +       zram_unlock_table(&meta->table[index]);
3506         atomic64_inc(&zram->stats.notify_free);
3509 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zram_drv.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zram_drv.h
3510 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/block/zram/zram_drv.h   2017-04-16 10:37:39.000000000 +0200
3511 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/block/zram/zram_drv.h        2017-04-18 17:54:22.000000000 +0200
3512 @@ -73,6 +73,9 @@
3513  struct zram_table_entry {
3514         unsigned long handle;
3515         unsigned long value;
3516 +#ifdef CONFIG_PREEMPT_RT_BASE
3517 +       spinlock_t lock;
3518 +#endif
3519  };
3521  struct zram_stats {
3522 @@ -120,4 +123,42 @@
3523          */
3524         bool claim; /* Protected by bdev->bd_mutex */
3525  };
3527 +#ifndef CONFIG_PREEMPT_RT_BASE
3528 +static inline void zram_lock_table(struct zram_table_entry *table)
3529 +{
3530 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3531 +}
3532 +
3533 +static inline void zram_unlock_table(struct zram_table_entry *table)
3534 +{
3535 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3536 +}
3537 +
3538 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3539 +#else /* CONFIG_PREEMPT_RT_BASE */
3540 +static inline void zram_lock_table(struct zram_table_entry *table)
3541 +{
3542 +       spin_lock(&table->lock);
3543 +       __set_bit(ZRAM_ACCESS, &table->value);
3544 +}
3545 +
3546 +static inline void zram_unlock_table(struct zram_table_entry *table)
3547 +{
3548 +       __clear_bit(ZRAM_ACCESS, &table->value);
3549 +       spin_unlock(&table->lock);
3550 +}
3551 +
3552 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3553 +{
3554 +        size_t num_pages = disksize >> PAGE_SHIFT;
3555 +        size_t index;
3556 +
3557 +        for (index = 0; index < num_pages; index++) {
3558 +               spinlock_t *lock = &meta->table[index].lock;
3559 +               spin_lock_init(lock);
3560 +        }
3561 +}
3562 +#endif /* CONFIG_PREEMPT_RT_BASE */
3564  #endif
3565 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/char/random.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/char/random.c
3566 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/char/random.c   2017-04-16 10:37:39.000000000 +0200
3567 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/char/random.c        2017-04-18 17:54:22.000000000 +0200
3568 @@ -1028,8 +1028,6 @@
3569         } sample;
3570         long delta, delta2, delta3;
3572 -       preempt_disable();
3574         sample.jiffies = jiffies;
3575         sample.cycles = random_get_entropy();
3576         sample.num = num;
3577 @@ -1070,7 +1068,6 @@
3578                  */
3579                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3580         }
3581 -       preempt_enable();
3584  void add_input_randomness(unsigned int type, unsigned int code,
3585 @@ -1123,28 +1120,27 @@
3586         return *(ptr + f->reg_idx++);
3589 -void add_interrupt_randomness(int irq, int irq_flags)
3590 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3592         struct entropy_store    *r;
3593         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3594 -       struct pt_regs          *regs = get_irq_regs();
3595         unsigned long           now = jiffies;
3596         cycles_t                cycles = random_get_entropy();
3597         __u32                   c_high, j_high;
3598 -       __u64                   ip;
3599         unsigned long           seed;
3600         int                     credit = 0;
3602         if (cycles == 0)
3603 -               cycles = get_reg(fast_pool, regs);
3604 +               cycles = get_reg(fast_pool, NULL);
3605         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3606         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3607         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3608         fast_pool->pool[1] ^= now ^ c_high;
3609 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3610 +       if (!ip)
3611 +               ip = _RET_IP_;
3612         fast_pool->pool[2] ^= ip;
3613         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3614 -               get_reg(fast_pool, regs);
3615 +               get_reg(fast_pool, NULL);
3617         fast_mix(fast_pool);
3618         add_interrupt_bench(cycles);
3619 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/tcb_clksrc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/tcb_clksrc.c
3620 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/tcb_clksrc.c        2017-04-16 10:37:40.000000000 +0200
3621 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/tcb_clksrc.c     2017-04-18 17:54:22.000000000 +0200
3622 @@ -23,8 +23,7 @@
3623   *     this 32 bit free-running counter. the second channel is not used.
3624   *
3625   *   - The third channel may be used to provide a 16-bit clockevent
3626 - *     source, used in either periodic or oneshot mode.  This runs
3627 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3628 + *     source, used in either periodic or oneshot mode.
3629   *
3630   * A boot clocksource and clockevent source are also currently needed,
3631   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3632 @@ -74,6 +73,8 @@
3633  struct tc_clkevt_device {
3634         struct clock_event_device       clkevt;
3635         struct clk                      *clk;
3636 +       bool                            clk_enabled;
3637 +       u32                             freq;
3638         void __iomem                    *regs;
3639  };
3641 @@ -82,15 +83,26 @@
3642         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3645 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3646 - * because using one of the divided clocks would usually mean the
3647 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3648 - *
3649 - * A divided clock could be good for high resolution timers, since
3650 - * 30.5 usec resolution can seem "low".
3651 - */
3652  static u32 timer_clock;
3654 +static void tc_clk_disable(struct clock_event_device *d)
3655 +{
3656 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3657 +
3658 +       clk_disable(tcd->clk);
3659 +       tcd->clk_enabled = false;
3660 +}
3661 +
3662 +static void tc_clk_enable(struct clock_event_device *d)
3663 +{
3664 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3665 +
3666 +       if (tcd->clk_enabled)
3667 +               return;
3668 +       clk_enable(tcd->clk);
3669 +       tcd->clk_enabled = true;
3670 +}
3671 +
3672  static int tc_shutdown(struct clock_event_device *d)
3674         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3675 @@ -98,8 +110,14 @@
3677         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3678         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3679 +       return 0;
3682 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3684 +       tc_shutdown(d);
3685         if (!clockevent_state_detached(d))
3686 -               clk_disable(tcd->clk);
3687 +               tc_clk_disable(d);
3689         return 0;
3691 @@ -112,9 +130,9 @@
3692         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3693                 tc_shutdown(d);
3695 -       clk_enable(tcd->clk);
3696 +       tc_clk_enable(d);
3698 -       /* slow clock, count up to RC, then irq and stop */
3699 +       /* count up to RC, then irq and stop */
3700         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3701                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3702         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3703 @@ -134,12 +152,12 @@
3704         /* By not making the gentime core emulate periodic mode on top
3705          * of oneshot, we get lower overhead and improved accuracy.
3706          */
3707 -       clk_enable(tcd->clk);
3708 +       tc_clk_enable(d);
3710 -       /* slow clock, count up to RC, then irq and restart */
3711 +       /* count up to RC, then irq and restart */
3712         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3713                      regs + ATMEL_TC_REG(2, CMR));
3714 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3715 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3717         /* Enable clock and interrupts on RC compare */
3718         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3719 @@ -166,9 +184,13 @@
3720                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3721                                           CLOCK_EVT_FEAT_ONESHOT,
3722                 /* Should be lower than at91rm9200's system timer */
3723 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3724                 .rating                 = 125,
3725 +#else
3726 +               .rating                 = 200,
3727 +#endif
3728                 .set_next_event         = tc_next_event,
3729 -               .set_state_shutdown     = tc_shutdown,
3730 +               .set_state_shutdown     = tc_shutdown_clk_off,
3731                 .set_state_periodic     = tc_set_periodic,
3732                 .set_state_oneshot      = tc_set_oneshot,
3733         },
3734 @@ -188,8 +210,9 @@
3735         return IRQ_NONE;
3738 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3739 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3741 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3742         int ret;
3743         struct clk *t2_clk = tc->clk[2];
3744         int irq = tc->irq[2];
3745 @@ -210,7 +233,11 @@
3746         clkevt.regs = tc->regs;
3747         clkevt.clk = t2_clk;
3749 -       timer_clock = clk32k_divisor_idx;
3750 +       timer_clock = divisor_idx;
3751 +       if (!divisor)
3752 +               clkevt.freq = 32768;
3753 +       else
3754 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3756         clkevt.clkevt.cpumask = cpumask_of(0);
3758 @@ -221,7 +248,7 @@
3759                 return ret;
3760         }
3762 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3763 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3765         return ret;
3767 @@ -358,7 +385,11 @@
3768                 goto err_disable_t1;
3770         /* channel 2:  periodic and oneshot timer support */
3771 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3772         ret = setup_clkevents(tc, clk32k_divisor_idx);
3773 +#else
3774 +       ret = setup_clkevents(tc, best_divisor_idx);
3775 +#endif
3776         if (ret)
3777                 goto err_unregister_clksrc;
3779 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/timer-atmel-pit.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/timer-atmel-pit.c
3780 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/timer-atmel-pit.c   2017-04-16 10:37:40.000000000 +0200
3781 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/timer-atmel-pit.c        2017-04-18 17:54:22.000000000 +0200
3782 @@ -46,6 +46,7 @@
3783         u32             cycle;
3784         u32             cnt;
3785         unsigned int    irq;
3786 +       bool            irq_requested;
3787         struct clk      *mck;
3788  };
3790 @@ -96,15 +97,29 @@
3792         /* disable irq, leaving the clocksource active */
3793         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3794 +       if (data->irq_requested) {
3795 +               free_irq(data->irq, data);
3796 +               data->irq_requested = false;
3797 +       }
3798         return 0;
3801 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3802  /*
3803   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
3804   */
3805  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3807         struct pit_data *data = clkevt_to_pit_data(dev);
3808 +       int ret;
3810 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3811 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3812 +                         "at91_tick", data);
3813 +       if (ret)
3814 +               panic(pr_fmt("Unable to setup IRQ\n"));
3816 +       data->irq_requested = true;
3818         /* update clocksource counter */
3819         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3820 @@ -230,15 +245,6 @@
3821                 return ret;
3822         }
3824 -       /* Set up irq handler */
3825 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3826 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3827 -                         "at91_tick", data);
3828 -       if (ret) {
3829 -               pr_err("Unable to setup IRQ\n");
3830 -               return ret;
3831 -       }
3833         /* Set up and register clockevents */
3834         data->clkevt.name = "pit";
3835         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3836 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/timer-atmel-st.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/timer-atmel-st.c
3837 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/clocksource/timer-atmel-st.c    2017-04-16 10:37:40.000000000 +0200
3838 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/clocksource/timer-atmel-st.c 2017-04-18 17:54:22.000000000 +0200
3839 @@ -115,18 +115,29 @@
3840         last_crtr = read_CRTR();
3843 +static int atmel_st_irq;
3845  static int clkevt32k_shutdown(struct clock_event_device *evt)
3847         clkdev32k_disable_and_flush_irq();
3848         irqmask = 0;
3849         regmap_write(regmap_st, AT91_ST_IER, irqmask);
3850 +       free_irq(atmel_st_irq, regmap_st);
3851         return 0;
3854  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3856 +       int ret;
3858         clkdev32k_disable_and_flush_irq();
3860 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3861 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3862 +                         "at91_tick", regmap_st);
3863 +       if (ret)
3864 +               panic(pr_fmt("Unable to setup IRQ\n"));
3866         /*
3867          * ALM for oneshot irqs, set by next_event()
3868          * before 32 seconds have passed.
3869 @@ -139,8 +150,16 @@
3871  static int clkevt32k_set_periodic(struct clock_event_device *dev)
3873 +       int ret;
3875         clkdev32k_disable_and_flush_irq();
3877 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3878 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3879 +                         "at91_tick", regmap_st);
3880 +       if (ret)
3881 +               panic(pr_fmt("Unable to setup IRQ\n"));
3883         /* PIT for periodic irqs; fixed rate of 1/HZ */
3884         irqmask = AT91_ST_PITS;
3885         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3886 @@ -198,7 +217,7 @@
3888         struct clk *sclk;
3889         unsigned int sclk_rate, val;
3890 -       int irq, ret;
3891 +       int ret;
3893         regmap_st = syscon_node_to_regmap(node);
3894         if (IS_ERR(regmap_st)) {
3895 @@ -212,21 +231,12 @@
3896         regmap_read(regmap_st, AT91_ST_SR, &val);
3898         /* Get the interrupts property */
3899 -       irq  = irq_of_parse_and_map(node, 0);
3900 -       if (!irq) {
3901 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
3902 +       if (!atmel_st_irq) {
3903                 pr_err("Unable to get IRQ from DT\n");
3904                 return -EINVAL;
3905         }
3907 -       /* Make IRQs happen for the system timer */
3908 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
3909 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3910 -                         "at91_tick", regmap_st);
3911 -       if (ret) {
3912 -               pr_err("Unable to setup IRQ\n");
3913 -               return ret;
3914 -       }
3916         sclk = of_clk_get(node, 0);
3917         if (IS_ERR(sclk)) {
3918                 pr_err("Unable to get slow clock\n");
3919 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/connector/cn_proc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/connector/cn_proc.c
3920 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/connector/cn_proc.c     2017-04-16 10:37:40.000000000 +0200
3921 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/connector/cn_proc.c  2017-04-18 17:54:22.000000000 +0200
3922 @@ -32,6 +32,7 @@
3923  #include <linux/pid_namespace.h>
3925  #include <linux/cn_proc.h>
3926 +#include <linux/locallock.h>
3928  /*
3929   * Size of a cn_msg followed by a proc_event structure.  Since the
3930 @@ -54,10 +55,11 @@
3932  /* proc_event_counts is used as the sequence number of the netlink message */
3933  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
3934 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
3936  static inline void send_msg(struct cn_msg *msg)
3938 -       preempt_disable();
3939 +       local_lock(send_msg_lock);
3941         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
3942         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
3943 @@ -70,7 +72,7 @@
3944          */
3945         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
3947 -       preempt_enable();
3948 +       local_unlock(send_msg_lock);
3951  void proc_fork_connector(struct task_struct *task)
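send_msg() above trades preempt_disable() for a local lock from the RT patch's linux/locallock.h: on non-RT kernels local_lock() still just disables preemption, while on RT it takes a per-CPU sleeping lock, so the netlink send stays preemptible but the per-CPU sequence counter remains consistent. A minimal sketch with hypothetical names:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(counter_lock);     /* hypothetical lock */
static DEFINE_PER_CPU(unsigned long, counter);

static void bump_counter(void)
{
        local_lock(counter_lock);       /* preempt_disable() on !RT, per-CPU lock on RT */
        this_cpu_inc(counter);
        local_unlock(counter_lock);
}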
3952 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/cpufreq/Kconfig.x86 linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/cpufreq/Kconfig.x86
3953 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/cpufreq/Kconfig.x86     2017-04-16 10:37:40.000000000 +0200
3954 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/cpufreq/Kconfig.x86  2017-04-18 17:54:22.000000000 +0200
3955 @@ -124,7 +124,7 @@
3957  config X86_POWERNOW_K8
3958         tristate "AMD Opteron/Athlon64 PowerNow!"
3959 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
3960 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
3961         help
3962           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
3963           Support for K10 and newer processors is now in acpi-cpufreq.
3964 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
3965 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c      2017-04-16 10:37:42.000000000 +0200
3966 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_gem_execbuffer.c   2017-04-18 17:54:22.000000000 +0200
3967 @@ -1537,7 +1537,9 @@
3968         if (ret)
3969                 return ret;
3971 +#ifndef CONFIG_PREEMPT_RT_BASE
3972         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
3973 +#endif
3975         i915_gem_execbuffer_move_to_active(vmas, params->request);
3977 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_gem_shrinker.c
3978 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c        2017-04-16 10:37:42.000000000 +0200
3979 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_gem_shrinker.c     2017-04-18 17:54:22.000000000 +0200
3980 @@ -40,7 +40,7 @@
3981         if (!mutex_is_locked(mutex))
3982                 return false;
3984 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
3985 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
3986         return mutex->owner == task;
3987  #else
3988         /* Since UP may be pre-empted, we cannot assume that we own the lock */
3989 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_irq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_irq.c
3990 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/i915_irq.c 2017-04-16 10:37:42.000000000 +0200
3991 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/i915_irq.c      2017-04-18 17:54:22.000000000 +0200
3992 @@ -812,6 +812,7 @@
3993         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
3995         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
3996 +       preempt_disable_rt();
3998         /* Get optional system timestamp before query. */
3999         if (stime)
4000 @@ -863,6 +864,7 @@
4001                 *etime = ktime_get();
4003         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4004 +       preempt_enable_rt();
4006         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4008 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/intel_display.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/intel_display.c
4009 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/intel_display.c    2017-04-16 10:37:42.000000000 +0200
4010 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/intel_display.c 2017-04-18 17:54:22.000000000 +0200
4011 @@ -12141,7 +12141,7 @@
4012         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4013         struct intel_flip_work *work;
4015 -       WARN_ON(!in_interrupt());
4016 +       WARN_ON_NONRT(!in_interrupt());
4018         if (crtc == NULL)
4019                 return;
4020 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/intel_sprite.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/intel_sprite.c
4021 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/i915/intel_sprite.c     2017-04-16 10:37:42.000000000 +0200
4022 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/i915/intel_sprite.c  2017-04-18 17:54:22.000000000 +0200
4023 @@ -35,6 +35,7 @@
4024  #include <drm/drm_rect.h>
4025  #include <drm/drm_atomic.h>
4026  #include <drm/drm_plane_helper.h>
4027 +#include <linux/locallock.h>
4028  #include "intel_drv.h"
4029  #include "intel_frontbuffer.h"
4030  #include <drm/i915_drm.h>
4031 @@ -65,6 +66,8 @@
4032                             1000 * adjusted_mode->crtc_htotal);
4035 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4037  /**
4038   * intel_pipe_update_start() - start update of a set of display registers
4039   * @crtc: the crtc of which the registers are going to be updated
4040 @@ -95,7 +98,7 @@
4041         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4042         max = vblank_start - 1;
4044 -       local_irq_disable();
4045 +       local_lock_irq(pipe_update_lock);
4047         if (min <= 0 || max <= 0)
4048                 return;
4049 @@ -125,11 +128,11 @@
4050                         break;
4051                 }
4053 -               local_irq_enable();
4054 +               local_unlock_irq(pipe_update_lock);
4056                 timeout = schedule_timeout(timeout);
4058 -               local_irq_disable();
4059 +               local_lock_irq(pipe_update_lock);
4060         }
4062         finish_wait(wq, &wait);
4063 @@ -181,7 +184,7 @@
4064                 crtc->base.state->event = NULL;
4065         }
4067 -       local_irq_enable();
4068 +       local_unlock_irq(pipe_update_lock);
4070         if (crtc->debug.start_vbl_count &&
4071             crtc->debug.start_vbl_count != end_vbl_count) {
4072 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4073 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c  2017-04-16 10:37:42.000000000 +0200
4074 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/msm/msm_gem_shrinker.c       2017-04-18 17:54:22.000000000 +0200
4075 @@ -23,7 +23,7 @@
4076         if (!mutex_is_locked(mutex))
4077                 return false;
4079 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4080 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4081         return mutex->owner == task;
4082  #else
4083         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4084 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/radeon/radeon_display.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/radeon/radeon_display.c
4085 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/gpu/drm/radeon/radeon_display.c 2017-04-16 10:37:43.000000000 +0200
4086 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/gpu/drm/radeon/radeon_display.c      2017-04-18 17:54:23.000000000 +0200
4087 @@ -1845,6 +1845,7 @@
4088         struct radeon_device *rdev = dev->dev_private;
4090         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4091 +       preempt_disable_rt();
4093         /* Get optional system timestamp before query. */
4094         if (stime)
4095 @@ -1937,6 +1938,7 @@
4096                 *etime = ktime_get();
4098         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4099 +       preempt_enable_rt();
4101         /* Decode into vertical and horizontal scanout position. */
4102         *vpos = position & 0x1fff;
4103 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/hv/vmbus_drv.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/hv/vmbus_drv.c
4104 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/hv/vmbus_drv.c  2017-04-16 10:37:44.000000000 +0200
4105 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/hv/vmbus_drv.c       2017-04-18 17:54:23.000000000 +0200
4106 @@ -761,6 +761,8 @@
4107         void *page_addr;
4108         struct hv_message *msg;
4109         union hv_synic_event_flags *event;
4110 +       struct pt_regs *regs = get_irq_regs();
4111 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4112         bool handled = false;
4114         page_addr = hv_context.synic_event_page[cpu];
4115 @@ -808,7 +810,7 @@
4116                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4117         }
4119 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4120 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4124 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/alim15x3.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/alim15x3.c
4125 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/alim15x3.c  2017-04-16 10:37:44.000000000 +0200
4126 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/alim15x3.c       2017-04-18 17:54:23.000000000 +0200
4127 @@ -234,7 +234,7 @@
4129         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4131 -       local_irq_save(flags);
4132 +       local_irq_save_nort(flags);
4134         if (m5229_revision < 0xC2) {
4135                 /*
4136 @@ -325,7 +325,7 @@
4137         }
4138         pci_dev_put(north);
4139         pci_dev_put(isa_dev);
4140 -       local_irq_restore(flags);
4141 +       local_irq_restore_nort(flags);
4142         return 0;
4145 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/hpt366.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/hpt366.c
4146 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/hpt366.c    2017-04-16 10:37:44.000000000 +0200
4147 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/hpt366.c 2017-04-18 17:54:23.000000000 +0200
4148 @@ -1236,7 +1236,7 @@
4150         dma_old = inb(base + 2);
4152 -       local_irq_save(flags);
4153 +       local_irq_save_nort(flags);
4155         dma_new = dma_old;
4156         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4157 @@ -1247,7 +1247,7 @@
4158         if (dma_new != dma_old)
4159                 outb(dma_new, base + 2);
4161 -       local_irq_restore(flags);
4162 +       local_irq_restore_nort(flags);
4164         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4165                          hwif->name, base, base + 7);
4166 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-io-std.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-io-std.c
4167 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-io-std.c        2017-04-16 10:37:44.000000000 +0200
4168 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-io-std.c     2017-04-18 17:54:23.000000000 +0200
4169 @@ -175,7 +175,7 @@
4170                 unsigned long uninitialized_var(flags);
4172                 if ((io_32bit & 2) && !mmio) {
4173 -                       local_irq_save(flags);
4174 +                       local_irq_save_nort(flags);
4175                         ata_vlb_sync(io_ports->nsect_addr);
4176                 }
4178 @@ -186,7 +186,7 @@
4179                         insl(data_addr, buf, words);
4181                 if ((io_32bit & 2) && !mmio)
4182 -                       local_irq_restore(flags);
4183 +                       local_irq_restore_nort(flags);
4185                 if (((len + 1) & 3) < 2)
4186                         return;
4187 @@ -219,7 +219,7 @@
4188                 unsigned long uninitialized_var(flags);
4190                 if ((io_32bit & 2) && !mmio) {
4191 -                       local_irq_save(flags);
4192 +                       local_irq_save_nort(flags);
4193                         ata_vlb_sync(io_ports->nsect_addr);
4194                 }
4196 @@ -230,7 +230,7 @@
4197                         outsl(data_addr, buf, words);
4199                 if ((io_32bit & 2) && !mmio)
4200 -                       local_irq_restore(flags);
4201 +                       local_irq_restore_nort(flags);
4203                 if (((len + 1) & 3) < 2)
4204                         return;
4205 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-io.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-io.c
4206 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-io.c    2017-04-16 10:37:44.000000000 +0200
4207 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-io.c 2017-04-18 17:54:23.000000000 +0200
4208 @@ -659,7 +659,7 @@
4209                 /* disable_irq_nosync ?? */
4210                 disable_irq(hwif->irq);
4211                 /* local CPU only, as if we were handling an interrupt */
4212 -               local_irq_disable();
4213 +               local_irq_disable_nort();
4214                 if (hwif->polling) {
4215                         startstop = handler(drive);
4216                 } else if (drive_is_ready(drive)) {
4217 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-iops.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-iops.c
4218 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-iops.c  2017-04-16 10:37:44.000000000 +0200
4219 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-iops.c       2017-04-18 17:54:23.000000000 +0200
4220 @@ -129,12 +129,12 @@
4221                                 if ((stat & ATA_BUSY) == 0)
4222                                         break;
4224 -                               local_irq_restore(flags);
4225 +                               local_irq_restore_nort(flags);
4226                                 *rstat = stat;
4227                                 return -EBUSY;
4228                         }
4229                 }
4230 -               local_irq_restore(flags);
4231 +               local_irq_restore_nort(flags);
4232         }
4233         /*
4234          * Allow status to settle, then read it again.
4235 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-probe.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-probe.c
4236 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-probe.c 2017-04-16 10:37:44.000000000 +0200
4237 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-probe.c      2017-04-18 17:54:23.000000000 +0200
4238 @@ -196,10 +196,10 @@
4239         int bswap = 1;
4241         /* local CPU only; some systems need this */
4242 -       local_irq_save(flags);
4243 +       local_irq_save_nort(flags);
4244         /* read 512 bytes of id info */
4245         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4246 -       local_irq_restore(flags);
4247 +       local_irq_restore_nort(flags);
4249         drive->dev_flags |= IDE_DFLAG_ID_READ;
4250  #ifdef DEBUG
4251 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-taskfile.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-taskfile.c
4252 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/ide/ide-taskfile.c      2017-04-16 10:37:44.000000000 +0200
4253 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/ide/ide-taskfile.c   2017-04-18 17:54:23.000000000 +0200
4254 @@ -250,7 +250,7 @@
4256                 page_is_high = PageHighMem(page);
4257                 if (page_is_high)
4258 -                       local_irq_save(flags);
4259 +                       local_irq_save_nort(flags);
4261                 buf = kmap_atomic(page) + offset;
4263 @@ -271,7 +271,7 @@
4264                 kunmap_atomic(buf);
4266                 if (page_is_high)
4267 -                       local_irq_restore(flags);
4268 +                       local_irq_restore_nort(flags);
4270                 len -= nr_bytes;
4271         }
4272 @@ -414,7 +414,7 @@
4273         }
4275         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4276 -               local_irq_disable();
4277 +               local_irq_disable_nort();
4279         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
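
The IDE hunks above, and the gameport/SCSI hunks further down, replace hard interrupt disabling with the *_nort() variants. Their defining header is not part of this excerpt, so treat the following as a rough sketch of the intended semantics rather than the exact RT definitions: on a non-RT kernel they behave like the plain local_irq_*() helpers, while with PREEMPT_RT_FULL they leave interrupts enabled, because these sections only need serialization against other tasks, which RT provides through sleeping locks.

/* Rough sketch of the *_nort() idea; the real RT definitions may differ. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_disable_nort()	barrier()
# define local_irq_enable_nort()	barrier()
# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
#else
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
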
4281 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4282 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c  2017-04-16 10:37:45.000000000 +0200
4283 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c       2017-04-18 17:54:23.000000000 +0200
4284 @@ -902,7 +902,7 @@
4286         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4288 -       local_irq_save(flags);
4289 +       local_irq_save_nort(flags);
4290         netif_addr_lock(dev);
4291         spin_lock(&priv->lock);
4293 @@ -984,7 +984,7 @@
4295         spin_unlock(&priv->lock);
4296         netif_addr_unlock(dev);
4297 -       local_irq_restore(flags);
4298 +       local_irq_restore_nort(flags);
4300         /*
4301          * make sure the in-flight joins have finished before we attempt
4302 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/input/gameport/gameport.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/input/gameport/gameport.c
4303 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/input/gameport/gameport.c       2017-04-16 10:37:45.000000000 +0200
4304 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/input/gameport/gameport.c    2017-04-18 17:54:23.000000000 +0200
4305 @@ -91,13 +91,13 @@
4306         tx = ~0;
4308         for (i = 0; i < 50; i++) {
4309 -               local_irq_save(flags);
4310 +               local_irq_save_nort(flags);
4311                 t1 = ktime_get_ns();
4312                 for (t = 0; t < 50; t++)
4313                         gameport_read(gameport);
4314                 t2 = ktime_get_ns();
4315                 t3 = ktime_get_ns();
4316 -               local_irq_restore(flags);
4317 +               local_irq_restore_nort(flags);
4318                 udelay(i * 10);
4319                 t = (t2 - t1) - (t3 - t2);
4320                 if (t < tx)
4321 @@ -124,12 +124,12 @@
4322         tx = 1 << 30;
4324         for(i = 0; i < 50; i++) {
4325 -               local_irq_save(flags);
4326 +               local_irq_save_nort(flags);
4327                 GET_TIME(t1);
4328                 for (t = 0; t < 50; t++) gameport_read(gameport);
4329                 GET_TIME(t2);
4330                 GET_TIME(t3);
4331 -               local_irq_restore(flags);
4332 +               local_irq_restore_nort(flags);
4333                 udelay(i * 10);
4334                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4335         }
4336 @@ -148,11 +148,11 @@
4337         tx = 1 << 30;
4339         for(i = 0; i < 50; i++) {
4340 -               local_irq_save(flags);
4341 +               local_irq_save_nort(flags);
4342                 t1 = rdtsc();
4343                 for (t = 0; t < 50; t++) gameport_read(gameport);
4344                 t2 = rdtsc();
4345 -               local_irq_restore(flags);
4346 +               local_irq_restore_nort(flags);
4347                 udelay(i * 10);
4348                 if (t2 - t1 < tx) tx = t2 - t1;
4349         }
4350 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/amd_iommu.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/amd_iommu.c
4351 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/amd_iommu.c       2017-04-16 10:37:46.000000000 +0200
4352 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/amd_iommu.c    2017-04-18 17:54:23.000000000 +0200
4353 @@ -1923,10 +1923,10 @@
4354         int ret;
4356         /*
4357 -        * Must be called with IRQs disabled. Warn here to detect early
4358 -        * when its not.
4359 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4360 +        * detect early when it's not.
4361          */
4362 -       WARN_ON(!irqs_disabled());
4363 +       WARN_ON_NONRT(!irqs_disabled());
4365         /* lock domain */
4366         spin_lock(&domain->lock);
4367 @@ -2094,10 +2094,10 @@
4368         struct protection_domain *domain;
4370         /*
4371 -        * Must be called with IRQs disabled. Warn here to detect early
4372 -        * when its not.
4373 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4374 +        * detect early when it's not.
4375          */
4376 -       WARN_ON(!irqs_disabled());
4377 +       WARN_ON_NONRT(!irqs_disabled());
4379         if (WARN_ON(!dev_data->domain))
4380                 return;
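
The WARN_ON() to WARN_ON_NONRT() downgrade above (and the BUG_ON_NONRT() used later in dm-rq) exists because on PREEMPT_RT these paths legitimately run with interrupts enabled. The macro is not defined in this excerpt; a plausible sketch of how such helpers are wired up in the RT tree:

/* Sketch: keep the check on mainline configs, compile it out on RT. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define WARN_ON_NONRT(condition)	do { } while (0)
# define BUG_ON_NONRT(condition)	do { } while (0)
#else
# define WARN_ON_NONRT(condition)	WARN_ON(condition)
# define BUG_ON_NONRT(condition)	BUG_ON(condition)
#endif
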
4381 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/intel-iommu.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/intel-iommu.c
4382 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/intel-iommu.c     2017-04-16 10:37:46.000000000 +0200
4383 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/intel-iommu.c  2017-04-18 17:54:23.000000000 +0200
4384 @@ -479,7 +479,7 @@
4385         struct deferred_flush_table *tables;
4386  };
4388 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4389 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4391  /* bitmap for indexing intel_iommus */
4392  static int g_num_of_iommus;
4393 @@ -3716,10 +3716,8 @@
4394         struct intel_iommu *iommu;
4395         struct deferred_flush_entry *entry;
4396         struct deferred_flush_data *flush_data;
4397 -       unsigned int cpuid;
4399 -       cpuid = get_cpu();
4400 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4401 +       flush_data = raw_cpu_ptr(&deferred_flush);
4403         /* Flush all CPUs' entries to avoid deferring too much.  If
4404          * this becomes a bottleneck, can just flush us, and rely on
4405 @@ -3752,8 +3750,6 @@
4406         }
4407         flush_data->size++;
4408         spin_unlock_irqrestore(&flush_data->lock, flags);
4410 -       put_cpu();
4413  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4414 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/iova.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/iova.c
4415 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/iommu/iova.c    2017-04-16 10:37:46.000000000 +0200
4416 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/iommu/iova.c 2017-04-18 17:54:23.000000000 +0200
4417 @@ -22,6 +22,7 @@
4418  #include <linux/slab.h>
4419  #include <linux/smp.h>
4420  #include <linux/bitops.h>
4421 +#include <linux/cpu.h>
4423  static bool iova_rcache_insert(struct iova_domain *iovad,
4424                                unsigned long pfn,
4425 @@ -420,10 +421,8 @@
4427                 /* Try replenishing IOVAs by flushing rcache. */
4428                 flushed_rcache = true;
4429 -               preempt_disable();
4430                 for_each_online_cpu(cpu)
4431                         free_cpu_cached_iovas(cpu, iovad);
4432 -               preempt_enable();
4433                 goto retry;
4434         }
4436 @@ -751,7 +750,7 @@
4437         bool can_insert = false;
4438         unsigned long flags;
4440 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4441 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4442         spin_lock_irqsave(&cpu_rcache->lock, flags);
4444         if (!iova_magazine_full(cpu_rcache->loaded)) {
4445 @@ -781,7 +780,6 @@
4446                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4448         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4449 -       put_cpu_ptr(rcache->cpu_rcaches);
4451         if (mag_to_free) {
4452                 iova_magazine_free_pfns(mag_to_free, iovad);
4453 @@ -815,7 +813,7 @@
4454         bool has_pfn = false;
4455         unsigned long flags;
4457 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4458 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4459         spin_lock_irqsave(&cpu_rcache->lock, flags);
4461         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4462 @@ -837,7 +835,6 @@
4463                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4465         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4466 -       put_cpu_ptr(rcache->cpu_rcaches);
4468         return iova_pfn;
4470 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/leds/trigger/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/leds/trigger/Kconfig
4471 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/leds/trigger/Kconfig    2017-04-16 10:37:47.000000000 +0200
4472 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/leds/trigger/Kconfig 2017-04-18 17:54:23.000000000 +0200
4473 @@ -69,7 +69,7 @@
4475  config LEDS_TRIGGER_CPU
4476         bool "LED CPU Trigger"
4477 -       depends on LEDS_TRIGGERS
4478 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4479         help
4480           This allows LEDs to be controlled by active CPUs. This shows
4481           the active CPUs across an array of LEDs so you can see which
4482 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/bcache/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/bcache/Kconfig
4483 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/bcache/Kconfig       2017-04-16 10:37:47.000000000 +0200
4484 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/bcache/Kconfig    2017-04-18 17:54:23.000000000 +0200
4485 @@ -1,6 +1,7 @@
4487  config BCACHE
4488         tristate "Block device as cache"
4489 +       depends on !PREEMPT_RT_FULL
4490         ---help---
4491         Allows a block device to be used as cache for other devices; uses
4492         a btree for indexing and the layout is optimized for SSDs.
4493 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/dm-rq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/dm-rq.c
4494 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/dm-rq.c      2017-04-16 10:37:47.000000000 +0200
4495 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/dm-rq.c   2017-04-18 17:54:23.000000000 +0200
4496 @@ -842,7 +842,7 @@
4497                 /* Establish tio->ti before queuing work (map_tio_request) */
4498                 tio->ti = ti;
4499                 kthread_queue_work(&md->kworker, &tio->work);
4500 -               BUG_ON(!irqs_disabled());
4501 +               BUG_ON_NONRT(!irqs_disabled());
4502         }
4505 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/raid5.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/raid5.c
4506 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/raid5.c      2017-04-16 10:37:47.000000000 +0200
4507 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/raid5.c   2017-04-18 17:54:23.000000000 +0200
4508 @@ -1928,8 +1928,9 @@
4509         struct raid5_percpu *percpu;
4510         unsigned long cpu;
4512 -       cpu = get_cpu();
4513 +       cpu = get_cpu_light();
4514         percpu = per_cpu_ptr(conf->percpu, cpu);
4515 +       spin_lock(&percpu->lock);
4516         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4517                 ops_run_biofill(sh);
4518                 overlap_clear++;
4519 @@ -1985,7 +1986,8 @@
4520                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4521                                 wake_up(&sh->raid_conf->wait_for_overlap);
4522                 }
4523 -       put_cpu();
4524 +       spin_unlock(&percpu->lock);
4525 +       put_cpu_light();
4528  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4529 @@ -6391,6 +6393,7 @@
4530                        __func__, cpu);
4531                 return -ENOMEM;
4532         }
4533 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4534         return 0;
4537 @@ -6401,7 +6404,6 @@
4538         conf->percpu = alloc_percpu(struct raid5_percpu);
4539         if (!conf->percpu)
4540                 return -ENOMEM;
4542         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4543         if (!err) {
4544                 conf->scribble_disks = max(conf->raid_disks,
4545 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/raid5.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/raid5.h
4546 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/md/raid5.h      2017-04-16 10:37:47.000000000 +0200
4547 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/md/raid5.h   2017-04-18 17:54:23.000000000 +0200
4548 @@ -504,6 +504,7 @@
4549         int                     recovery_disabled;
4550         /* per cpu variables */
4551         struct raid5_percpu {
4552 +               spinlock_t      lock;           /* Protection for -RT */
4553                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4554                 struct flex_array *scribble;   /* space for constructing buffer
4555                                               * lists and performing address
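
The raid5 change above pairs get_cpu_light()/put_cpu_light() with a new per-CPU spinlock: disabling preemption across the whole stripe-processing section is not acceptable on RT, so (by assumption) get_cpu_light() only pins the task to its CPU and the added percpu->lock supplies the mutual exclusion that preempt_disable() used to provide. A compressed sketch of the pattern with hypothetical names (example_percpu, example_work):

/* Illustrative sketch; example_percpu/example_work are invented names. */
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/smp.h>

struct example_percpu {
	spinlock_t lock;	/* must be spin_lock_init()'d per CPU at setup */
	int scratch;
};
static DEFINE_PER_CPU(struct example_percpu, example_percpu);

static void example_work(void)
{
	struct example_percpu *p;
	int cpu;

	cpu = get_cpu_light();	/* pins to this CPU; no preempt_disable() on RT */
	p = per_cpu_ptr(&example_percpu, cpu);
	spin_lock(&p->lock);	/* sleeping lock on RT, so the section may block */
	p->scratch++;
	spin_unlock(&p->lock);
	put_cpu_light();
}
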
4556 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/misc/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/misc/Kconfig
4557 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/misc/Kconfig    2017-04-16 10:37:53.000000000 +0200
4558 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/misc/Kconfig 2017-04-18 17:54:23.000000000 +0200
4559 @@ -62,6 +62,7 @@
4560  config ATMEL_TCLIB
4561         bool "Atmel AT32/AT91 Timer/Counter Library"
4562         depends on (AVR32 || ARCH_AT91)
4563 +       default y if PREEMPT_RT_FULL
4564         help
4565           Select this if you want a library to allocate the Timer/Counter
4566           blocks found on many Atmel processors.  This facilitates using
4567 @@ -77,8 +78,7 @@
4568           are combined to make a single 32-bit timer.
4570           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4571 -         may be used as a clock event device supporting oneshot mode
4572 -         (delays of up to two seconds) based on the 32 KiHz clock.
4573 +         may be used as a clock event device supporting oneshot mode.
4575  config ATMEL_TCB_CLKSRC_BLOCK
4576         int
4577 @@ -92,6 +92,15 @@
4578           TC can be used for other purposes, such as PWM generation and
4579           interval timing.
4581 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4582 +       bool "TC Block use 32 KiHz clock"
4583 +       depends on ATMEL_TCB_CLKSRC
4584 +       default y if !PREEMPT_RT_FULL
4585 +       help
4586 +         Select this to use the 32 KiHz base clock rate as the TC block
4587 +         clock source for clock events.
4590  config DUMMY_IRQ
4591         tristate "Dummy IRQ handler"
4592         default n
4593 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/mmc/host/mmci.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/mmc/host/mmci.c
4594 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/mmc/host/mmci.c 2017-04-16 10:37:53.000000000 +0200
4595 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/mmc/host/mmci.c      2017-04-18 17:54:23.000000000 +0200
4596 @@ -1147,15 +1147,12 @@
4597         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4598         struct variant_data *variant = host->variant;
4599         void __iomem *base = host->base;
4600 -       unsigned long flags;
4601         u32 status;
4603         status = readl(base + MMCISTATUS);
4605         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4607 -       local_irq_save(flags);
4609         do {
4610                 unsigned int remain, len;
4611                 char *buffer;
4612 @@ -1195,8 +1192,6 @@
4614         sg_miter_stop(sg_miter);
4616 -       local_irq_restore(flags);
4618         /*
4619          * If we have less than the fifo 'half-full' threshold to transfer,
4620          * trigger a PIO interrupt as soon as any data is available.
4621 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/ethernet/3com/3c59x.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/ethernet/3com/3c59x.c
4622 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/ethernet/3com/3c59x.c       2017-04-16 10:37:55.000000000 +0200
4623 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/ethernet/3com/3c59x.c    2017-04-18 17:54:23.000000000 +0200
4624 @@ -842,9 +842,9 @@
4626         struct vortex_private *vp = netdev_priv(dev);
4627         unsigned long flags;
4628 -       local_irq_save(flags);
4629 +       local_irq_save_nort(flags);
4630         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4631 -       local_irq_restore(flags);
4632 +       local_irq_restore_nort(flags);
4634  #endif
4636 @@ -1910,12 +1910,12 @@
4637                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4638                          */
4639                         unsigned long flags;
4640 -                       local_irq_save(flags);
4641 +                       local_irq_save_nort(flags);
4642                         if (vp->full_bus_master_tx)
4643                                 boomerang_interrupt(dev->irq, dev);
4644                         else
4645                                 vortex_interrupt(dev->irq, dev);
4646 -                       local_irq_restore(flags);
4647 +                       local_irq_restore_nort(flags);
4648                 }
4649         }
4651 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/ethernet/realtek/8139too.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/ethernet/realtek/8139too.c
4652 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/ethernet/realtek/8139too.c  2017-04-16 10:37:58.000000000 +0200
4653 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/ethernet/realtek/8139too.c       2017-04-18 17:54:23.000000000 +0200
4654 @@ -2233,7 +2233,7 @@
4655         struct rtl8139_private *tp = netdev_priv(dev);
4656         const int irq = tp->pci_dev->irq;
4658 -       disable_irq(irq);
4659 +       disable_irq_nosync(irq);
4660         rtl8139_interrupt(irq, dev);
4661         enable_irq(irq);
4663 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4664 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c     2017-04-16 10:38:02.000000000 +0200
4665 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c  2017-04-18 17:54:24.000000000 +0200
4666 @@ -697,7 +697,7 @@
4667                         while (!ctx->done.done && msecs--)
4668                                 udelay(1000);
4669                 } else {
4670 -                       wait_event_interruptible(ctx->done.wait,
4671 +                       swait_event_interruptible(ctx->done.wait,
4672                                                  ctx->done.done);
4673                 }
4674                 break;
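
The swait_event_interruptible() substitution above (and in the USB gadget hunks further down) follows from the RT tree converting a completion's waitqueue to a simple-wait (swait) queue elsewhere in this patch; code that waits on ->wait directly therefore has to use the swait helpers. A minimal sketch under that assumption:

/* Sketch only: assumes the RT tree's swait-based struct completion. */
#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);

static int example_wait(void)
{
	/* Roughly wait_for_completion_interruptible(&example_done);
	 * the open-coded form is only needed when ->wait is touched directly.
	 */
	return swait_event_interruptible(example_done.wait, example_done.done);
}
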
4675 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/pci/access.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/pci/access.c
4676 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/pci/access.c    2017-04-16 10:38:05.000000000 +0200
4677 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/pci/access.c 2017-04-18 17:54:24.000000000 +0200
4678 @@ -672,7 +672,7 @@
4679         WARN_ON(!dev->block_cfg_access);
4681         dev->block_cfg_access = 0;
4682 -       wake_up_all(&pci_cfg_wait);
4683 +       wake_up_all_locked(&pci_cfg_wait);
4684         raw_spin_unlock_irqrestore(&pci_lock, flags);
4686  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4687 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/pinctrl/qcom/pinctrl-msm.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/pinctrl/qcom/pinctrl-msm.c
4688 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/pinctrl/qcom/pinctrl-msm.c      2017-04-16 10:38:05.000000000 +0200
4689 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/pinctrl/qcom/pinctrl-msm.c   2017-04-18 17:54:24.000000000 +0200
4690 @@ -61,7 +61,7 @@
4691         struct notifier_block restart_nb;
4692         int irq;
4694 -       spinlock_t lock;
4695 +       raw_spinlock_t lock;
4697         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4698         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4699 @@ -153,14 +153,14 @@
4700         if (WARN_ON(i == g->nfuncs))
4701                 return -EINVAL;
4703 -       spin_lock_irqsave(&pctrl->lock, flags);
4704 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4706         val = readl(pctrl->regs + g->ctl_reg);
4707         val &= ~mask;
4708         val |= i << g->mux_bit;
4709         writel(val, pctrl->regs + g->ctl_reg);
4711 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4712 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4714         return 0;
4716 @@ -323,14 +323,14 @@
4717                         break;
4718                 case PIN_CONFIG_OUTPUT:
4719                         /* set output value */
4720 -                       spin_lock_irqsave(&pctrl->lock, flags);
4721 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
4722                         val = readl(pctrl->regs + g->io_reg);
4723                         if (arg)
4724                                 val |= BIT(g->out_bit);
4725                         else
4726                                 val &= ~BIT(g->out_bit);
4727                         writel(val, pctrl->regs + g->io_reg);
4728 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
4729 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4731                         /* enable output */
4732                         arg = 1;
4733 @@ -351,12 +351,12 @@
4734                         return -EINVAL;
4735                 }
4737 -               spin_lock_irqsave(&pctrl->lock, flags);
4738 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
4739                 val = readl(pctrl->regs + g->ctl_reg);
4740                 val &= ~(mask << bit);
4741                 val |= arg << bit;
4742                 writel(val, pctrl->regs + g->ctl_reg);
4743 -               spin_unlock_irqrestore(&pctrl->lock, flags);
4744 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4745         }
4747         return 0;
4748 @@ -384,13 +384,13 @@
4750         g = &pctrl->soc->groups[offset];
4752 -       spin_lock_irqsave(&pctrl->lock, flags);
4753 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4755         val = readl(pctrl->regs + g->ctl_reg);
4756         val &= ~BIT(g->oe_bit);
4757         writel(val, pctrl->regs + g->ctl_reg);
4759 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4760 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4762         return 0;
4764 @@ -404,7 +404,7 @@
4766         g = &pctrl->soc->groups[offset];
4768 -       spin_lock_irqsave(&pctrl->lock, flags);
4769 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4771         val = readl(pctrl->regs + g->io_reg);
4772         if (value)
4773 @@ -417,7 +417,7 @@
4774         val |= BIT(g->oe_bit);
4775         writel(val, pctrl->regs + g->ctl_reg);
4777 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4778 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4780         return 0;
4782 @@ -443,7 +443,7 @@
4784         g = &pctrl->soc->groups[offset];
4786 -       spin_lock_irqsave(&pctrl->lock, flags);
4787 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4789         val = readl(pctrl->regs + g->io_reg);
4790         if (value)
4791 @@ -452,7 +452,7 @@
4792                 val &= ~BIT(g->out_bit);
4793         writel(val, pctrl->regs + g->io_reg);
4795 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4796 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4799  #ifdef CONFIG_DEBUG_FS
4800 @@ -571,7 +571,7 @@
4802         g = &pctrl->soc->groups[d->hwirq];
4804 -       spin_lock_irqsave(&pctrl->lock, flags);
4805 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4807         val = readl(pctrl->regs + g->intr_cfg_reg);
4808         val &= ~BIT(g->intr_enable_bit);
4809 @@ -579,7 +579,7 @@
4811         clear_bit(d->hwirq, pctrl->enabled_irqs);
4813 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4814 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4817  static void msm_gpio_irq_unmask(struct irq_data *d)
4818 @@ -592,7 +592,7 @@
4820         g = &pctrl->soc->groups[d->hwirq];
4822 -       spin_lock_irqsave(&pctrl->lock, flags);
4823 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4825         val = readl(pctrl->regs + g->intr_cfg_reg);
4826         val |= BIT(g->intr_enable_bit);
4827 @@ -600,7 +600,7 @@
4829         set_bit(d->hwirq, pctrl->enabled_irqs);
4831 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4832 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4835  static void msm_gpio_irq_ack(struct irq_data *d)
4836 @@ -613,7 +613,7 @@
4838         g = &pctrl->soc->groups[d->hwirq];
4840 -       spin_lock_irqsave(&pctrl->lock, flags);
4841 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4843         val = readl(pctrl->regs + g->intr_status_reg);
4844         if (g->intr_ack_high)
4845 @@ -625,7 +625,7 @@
4846         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4847                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4849 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4850 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4853  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4854 @@ -638,7 +638,7 @@
4856         g = &pctrl->soc->groups[d->hwirq];
4858 -       spin_lock_irqsave(&pctrl->lock, flags);
4859 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4861         /*
4862          * For hw without possibility of detecting both edges
4863 @@ -712,7 +712,7 @@
4864         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4865                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4867 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4868 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4870         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
4871                 irq_set_handler_locked(d, handle_level_irq);
4872 @@ -728,11 +728,11 @@
4873         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
4874         unsigned long flags;
4876 -       spin_lock_irqsave(&pctrl->lock, flags);
4877 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4879         irq_set_irq_wake(pctrl->irq, on);
4881 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4882 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4884         return 0;
4886 @@ -878,7 +878,7 @@
4887         pctrl->soc = soc_data;
4888         pctrl->chip = msm_gpio_template;
4890 -       spin_lock_init(&pctrl->lock);
4891 +       raw_spin_lock_init(&pctrl->lock);
4893         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
4894         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
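
The pinctrl-msm conversion above switches the driver lock to raw_spinlock_t because it is taken from the GPIO irqchip callbacks (mask/unmask/ack/set_type), which run in hard interrupt context even on RT, where an ordinary spinlock_t becomes a sleeping lock. A minimal sketch of the rule of thumb, with invented names (example_chip, example_irq_mask):

/* Illustrative sketch; not taken from the driver above. */
#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/types.h>

struct example_chip {
	raw_spinlock_t lock;	/* raw: may be taken from hard-IRQ context;
				 * init with raw_spin_lock_init() at probe time */
	u32 shadow;
};

static void example_irq_mask(struct example_chip *chip, unsigned int bit)
{
	unsigned long flags;

	/* raw_spin_lock_irqsave() stays a true spinning lock on RT,
	 * so it is safe in contexts that must not sleep.
	 */
	raw_spin_lock_irqsave(&chip->lock, flags);
	chip->shadow &= ~BIT(bit);
	raw_spin_unlock_irqrestore(&chip->lock, flags);
}
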
4895 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/fcoe/fcoe.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/fcoe/fcoe.c
4896 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/fcoe/fcoe.c        2017-04-16 10:38:09.000000000 +0200
4897 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/fcoe/fcoe.c     2017-04-18 17:54:24.000000000 +0200
4898 @@ -1455,11 +1455,11 @@
4899  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
4901         struct fcoe_percpu_s *fps;
4902 -       int rc;
4903 +       int rc, cpu = get_cpu_light();
4905 -       fps = &get_cpu_var(fcoe_percpu);
4906 +       fps = &per_cpu(fcoe_percpu, cpu);
4907         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
4908 -       put_cpu_var(fcoe_percpu);
4909 +       put_cpu_light();
4911         return rc;
4913 @@ -1646,11 +1646,11 @@
4914                 return 0;
4915         }
4917 -       stats = per_cpu_ptr(lport->stats, get_cpu());
4918 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
4919         stats->InvalidCRCCount++;
4920         if (stats->InvalidCRCCount < 5)
4921                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
4922 -       put_cpu();
4923 +       put_cpu_light();
4924         return -EINVAL;
4927 @@ -1693,7 +1693,7 @@
4928          */
4929         hp = (struct fcoe_hdr *) skb_network_header(skb);
4931 -       stats = per_cpu_ptr(lport->stats, get_cpu());
4932 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
4933         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
4934                 if (stats->ErrorFrames < 5)
4935                         printk(KERN_WARNING "fcoe: FCoE version "
4936 @@ -1725,13 +1725,13 @@
4937                 goto drop;
4939         if (!fcoe_filter_frames(lport, fp)) {
4940 -               put_cpu();
4941 +               put_cpu_light();
4942                 fc_exch_recv(lport, fp);
4943                 return;
4944         }
4945  drop:
4946         stats->ErrorFrames++;
4947 -       put_cpu();
4948 +       put_cpu_light();
4949         kfree_skb(skb);
4952 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/fcoe/fcoe_ctlr.c
4953 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/fcoe/fcoe_ctlr.c   2017-04-16 10:38:09.000000000 +0200
4954 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/fcoe/fcoe_ctlr.c        2017-04-18 17:54:24.000000000 +0200
4955 @@ -834,7 +834,7 @@
4957         INIT_LIST_HEAD(&del_list);
4959 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
4960 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
4962         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
4963                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
4964 @@ -870,7 +870,7 @@
4965                                 sel_time = fcf->time;
4966                 }
4967         }
4968 -       put_cpu();
4969 +       put_cpu_light();
4971         list_for_each_entry_safe(fcf, next, &del_list, list) {
4972                 /* Removes fcf from current list */
4973 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/libfc/fc_exch.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/libfc/fc_exch.c
4974 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/libfc/fc_exch.c    2017-04-16 10:38:09.000000000 +0200
4975 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/libfc/fc_exch.c 2017-04-18 17:54:24.000000000 +0200
4976 @@ -814,10 +814,10 @@
4977         }
4978         memset(ep, 0, sizeof(*ep));
4980 -       cpu = get_cpu();
4981 +       cpu = get_cpu_light();
4982         pool = per_cpu_ptr(mp->pool, cpu);
4983         spin_lock_bh(&pool->lock);
4984 -       put_cpu();
4985 +       put_cpu_light();
4987         /* peek cache of free slot */
4988         if (pool->left != FC_XID_UNKNOWN) {
4989 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/libsas/sas_ata.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/libsas/sas_ata.c
4990 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/libsas/sas_ata.c   2017-04-16 10:38:09.000000000 +0200
4991 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/libsas/sas_ata.c        2017-04-18 17:54:24.000000000 +0200
4992 @@ -190,7 +190,7 @@
4993         /* TODO: audit callers to ensure they are ready for qc_issue to
4994          * unconditionally re-enable interrupts
4995          */
4996 -       local_irq_save(flags);
4997 +       local_irq_save_nort(flags);
4998         spin_unlock(ap->lock);
5000         /* If the device fell off, no sense in issuing commands */
5001 @@ -252,7 +252,7 @@
5003   out:
5004         spin_lock(ap->lock);
5005 -       local_irq_restore(flags);
5006 +       local_irq_restore_nort(flags);
5007         return ret;
5010 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/qla2xxx/qla_inline.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/qla2xxx/qla_inline.h
5011 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/qla2xxx/qla_inline.h       2017-04-16 10:38:10.000000000 +0200
5012 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/qla2xxx/qla_inline.h    2017-04-18 17:54:25.000000000 +0200
5013 @@ -59,12 +59,12 @@
5015         unsigned long flags;
5016         struct qla_hw_data *ha = rsp->hw;
5017 -       local_irq_save(flags);
5018 +       local_irq_save_nort(flags);
5019         if (IS_P3P_TYPE(ha))
5020                 qla82xx_poll(0, rsp);
5021         else
5022                 ha->isp_ops->intr_handler(0, rsp);
5023 -       local_irq_restore(flags);
5024 +       local_irq_restore_nort(flags);
5027  static inline uint8_t *
5028 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/qla2xxx/qla_isr.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/qla2xxx/qla_isr.c
5029 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/scsi/qla2xxx/qla_isr.c  2017-04-16 10:38:10.000000000 +0200
5030 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/scsi/qla2xxx/qla_isr.c       2017-04-18 17:54:25.000000000 +0200
5031 @@ -3125,7 +3125,11 @@
5032                 * kref_put().
5033                 */
5034                 kref_get(&qentry->irq_notify.kref);
5035 +#ifdef CONFIG_PREEMPT_RT_BASE
5036 +               swork_queue(&qentry->irq_notify.swork);
5037 +#else
5038                 schedule_work(&qentry->irq_notify.work);
5039 +#endif
5040         }
5042         /*
5043 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/thermal/x86_pkg_temp_thermal.c
5044 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/thermal/x86_pkg_temp_thermal.c  2017-04-16 10:38:14.000000000 +0200
5045 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/thermal/x86_pkg_temp_thermal.c       2017-04-18 17:54:25.000000000 +0200
5046 @@ -29,6 +29,7 @@
5047  #include <linux/pm.h>
5048  #include <linux/thermal.h>
5049  #include <linux/debugfs.h>
5050 +#include <linux/swork.h>
5051  #include <asm/cpu_device_id.h>
5052  #include <asm/mce.h>
5054 @@ -353,7 +354,7 @@
5055         }
5058 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5059 +static void platform_thermal_notify_work(struct swork_event *event)
5061         unsigned long flags;
5062         int cpu = smp_processor_id();
5063 @@ -370,7 +371,7 @@
5064                         pkg_work_scheduled[phy_id]) {
5065                 disable_pkg_thres_interrupt();
5066                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5067 -               return -EINVAL;
5068 +               return;
5069         }
5070         pkg_work_scheduled[phy_id] = 1;
5071         spin_unlock_irqrestore(&pkg_work_lock, flags);
5072 @@ -379,9 +380,48 @@
5073         schedule_delayed_work_on(cpu,
5074                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5075                                 msecs_to_jiffies(notify_delay_ms));
5078 +#ifdef CONFIG_PREEMPT_RT_FULL
5079 +static struct swork_event notify_work;
5081 +static int thermal_notify_work_init(void)
5083 +       int err;
5085 +       err = swork_get();
5086 +       if (err)
5087 +               return err;
5089 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5090         return 0;
5093 +static void thermal_notify_work_cleanup(void)
5095 +       swork_put();
5098 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5100 +       swork_queue(&notify_work);
5101 +       return 0;
5104 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5106 +static int thermal_notify_work_init(void) { return 0; }
5108 +static void thermal_notify_work_cleanup(void) {  }
5110 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5112 +       platform_thermal_notify_work(NULL);
5114 +       return 0;
5116 +#endif /* CONFIG_PREEMPT_RT_FULL */
5118  static int find_siblings_cpu(int cpu)
5120         int i;
5121 @@ -585,6 +625,9 @@
5122         if (!x86_match_cpu(pkg_temp_thermal_ids))
5123                 return -ENODEV;
5125 +       if (thermal_notify_work_init())
5126 +               return -ENODEV;
5128         spin_lock_init(&pkg_work_lock);
5129         platform_thermal_package_notify =
5130                         pkg_temp_thermal_platform_thermal_notify;
5131 @@ -609,7 +652,7 @@
5132         kfree(pkg_work_scheduled);
5133         platform_thermal_package_notify = NULL;
5134         platform_thermal_package_rate_control = NULL;
5136 +       thermal_notify_work_cleanup();
5137         return -ENODEV;
5140 @@ -634,6 +677,7 @@
5141         mutex_unlock(&phy_dev_list_mutex);
5142         platform_thermal_package_notify = NULL;
5143         platform_thermal_package_rate_control = NULL;
5144 +       thermal_notify_work_cleanup();
5145         for_each_online_cpu(i)
5146                 cancel_delayed_work_sync(
5147                         &per_cpu(pkg_temp_thermal_threshold_work, i));
5148 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/8250/8250_core.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/8250/8250_core.c
5149 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/8250/8250_core.c     2017-04-16 10:38:14.000000000 +0200
5150 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/8250/8250_core.c  2017-04-18 17:54:25.000000000 +0200
5151 @@ -58,7 +58,16 @@
5153  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5155 -#define PASS_LIMIT     512
5157 + * On -rt we can have more delays, and legitimately
5158 + * so - don't drop work spuriously and spam the
5159 + * syslog:
5160 + */
5161 +#ifdef CONFIG_PREEMPT_RT_FULL
5162 +# define PASS_LIMIT    1000000
5163 +#else
5164 +# define PASS_LIMIT    512
5165 +#endif
5167  #include <asm/serial.h>
5168  /*
5169 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/8250/8250_port.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/8250/8250_port.c
5170 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/8250/8250_port.c     2017-04-16 10:38:14.000000000 +0200
5171 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/8250/8250_port.c  2017-04-18 17:54:25.000000000 +0200
5172 @@ -35,6 +35,7 @@
5173  #include <linux/nmi.h>
5174  #include <linux/mutex.h>
5175  #include <linux/slab.h>
5176 +#include <linux/kdb.h>
5177  #include <linux/uaccess.h>
5178  #include <linux/pm_runtime.h>
5179  #include <linux/timer.h>
5180 @@ -3144,9 +3145,9 @@
5182         serial8250_rpm_get(up);
5184 -       if (port->sysrq)
5185 +       if (port->sysrq || oops_in_progress)
5186                 locked = 0;
5187 -       else if (oops_in_progress)
5188 +       else if (in_kdb_printk())
5189                 locked = spin_trylock_irqsave(&port->lock, flags);
5190         else
5191                 spin_lock_irqsave(&port->lock, flags);
5192 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/amba-pl011.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/amba-pl011.c
5193 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/amba-pl011.c 2017-04-16 10:38:14.000000000 +0200
5194 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/amba-pl011.c      2017-04-18 17:54:25.000000000 +0200
5195 @@ -2211,13 +2211,19 @@
5197         clk_enable(uap->clk);
5199 -       local_irq_save(flags);
5200 +       /*
5201 +        * local_irq_save(flags);
5202 +        *
5203 +        * This local_irq_save() is nonsense. If we come in via sysrq
5204 +        * handling then interrupts are already disabled. Aside from
5205 +        * that, the port.sysrq check is racy on SMP regardless.
5206 +       */
5207         if (uap->port.sysrq)
5208                 locked = 0;
5209         else if (oops_in_progress)
5210 -               locked = spin_trylock(&uap->port.lock);
5211 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5212         else
5213 -               spin_lock(&uap->port.lock);
5214 +               spin_lock_irqsave(&uap->port.lock, flags);
5216         /*
5217          *      First save the CR then disable the interrupts
5218 @@ -2241,8 +2247,7 @@
5219                 pl011_write(old_cr, uap, REG_CR);
5221         if (locked)
5222 -               spin_unlock(&uap->port.lock);
5223 -       local_irq_restore(flags);
5224 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5226         clk_disable(uap->clk);
5228 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/omap-serial.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/omap-serial.c
5229 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/tty/serial/omap-serial.c        2017-04-16 10:38:15.000000000 +0200
5230 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/tty/serial/omap-serial.c     2017-04-18 17:54:25.000000000 +0200
5231 @@ -1257,13 +1257,10 @@
5233         pm_runtime_get_sync(up->dev);
5235 -       local_irq_save(flags);
5236 -       if (up->port.sysrq)
5237 -               locked = 0;
5238 -       else if (oops_in_progress)
5239 -               locked = spin_trylock(&up->port.lock);
5240 +       if (up->port.sysrq || oops_in_progress)
5241 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5242         else
5243 -               spin_lock(&up->port.lock);
5244 +               spin_lock_irqsave(&up->port.lock, flags);
5246         /*
5247          * First save the IER then disable the interrupts
5248 @@ -1292,8 +1289,7 @@
5249         pm_runtime_mark_last_busy(up->dev);
5250         pm_runtime_put_autosuspend(up->dev);
5251         if (locked)
5252 -               spin_unlock(&up->port.lock);
5253 -       local_irq_restore(flags);
5254 +               spin_unlock_irqrestore(&up->port.lock, flags);
5257  static int __init
5258 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/core/hcd.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/core/hcd.c
5259 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/core/hcd.c  2017-04-16 10:38:15.000000000 +0200
5260 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/core/hcd.c       2017-04-18 17:54:25.000000000 +0200
5261 @@ -1764,9 +1764,9 @@
5262          * and no one may trigger the above deadlock situation when
5263          * running complete() in tasklet.
5264          */
5265 -       local_irq_save(flags);
5266 +       local_irq_save_nort(flags);
5267         urb->complete(urb);
5268 -       local_irq_restore(flags);
5269 +       local_irq_restore_nort(flags);
5271         usb_anchor_resume_wakeups(anchor);
5272         atomic_dec(&urb->use_count);
5273 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/gadget/function/f_fs.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/gadget/function/f_fs.c
5274 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/gadget/function/f_fs.c      2017-04-16 10:38:15.000000000 +0200
5275 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/gadget/function/f_fs.c   2017-04-18 17:54:25.000000000 +0200
5276 @@ -1593,7 +1593,7 @@
5277                 pr_info("%s(): freeing\n", __func__);
5278                 ffs_data_clear(ffs);
5279                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5280 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5281 +                      swait_active(&ffs->ep0req_completion.wait));
5282                 kfree(ffs->dev_name);
5283                 kfree(ffs);
5284         }
5285 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/gadget/legacy/inode.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/gadget/legacy/inode.c
5286 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/drivers/usb/gadget/legacy/inode.c       2017-04-16 10:38:15.000000000 +0200
5287 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/drivers/usb/gadget/legacy/inode.c    2017-04-18 17:54:25.000000000 +0200
5288 @@ -346,7 +346,7 @@
5289         spin_unlock_irq (&epdata->dev->lock);
5291         if (likely (value == 0)) {
5292 -               value = wait_event_interruptible (done.wait, done.done);
5293 +               value = swait_event_interruptible (done.wait, done.done);
5294                 if (value != 0) {
5295                         spin_lock_irq (&epdata->dev->lock);
5296                         if (likely (epdata->ep != NULL)) {
5297 @@ -355,7 +355,7 @@
5298                                 usb_ep_dequeue (epdata->ep, epdata->req);
5299                                 spin_unlock_irq (&epdata->dev->lock);
5301 -                               wait_event (done.wait, done.done);
5302 +                               swait_event (done.wait, done.done);
5303                                 if (epdata->status == -ECONNRESET)
5304                                         epdata->status = -EINTR;
5305                         } else {
5306 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/aio.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/aio.c
5307 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/aio.c        2017-04-16 10:38:19.000000000 +0200
5308 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/aio.c     2017-04-18 17:54:25.000000000 +0200
5309 @@ -40,6 +40,7 @@
5310  #include <linux/ramfs.h>
5311  #include <linux/percpu-refcount.h>
5312  #include <linux/mount.h>
5313 +#include <linux/swork.h>
5315  #include <asm/kmap_types.h>
5316  #include <asm/uaccess.h>
5317 @@ -115,7 +116,7 @@
5318         struct page             **ring_pages;
5319         long                    nr_pages;
5321 -       struct work_struct      free_work;
5322 +       struct swork_event      free_work;
5324         /*
5325          * signals when all in-flight requests are done
5326 @@ -258,6 +259,7 @@
5327                 .mount          = aio_mount,
5328                 .kill_sb        = kill_anon_super,
5329         };
5330 +       BUG_ON(swork_get());
5331         aio_mnt = kern_mount(&aio_fs);
5332         if (IS_ERR(aio_mnt))
5333                 panic("Failed to create aio fs mount.");
5334 @@ -581,9 +583,9 @@
5335         return cancel(&kiocb->common);
5338 -static void free_ioctx(struct work_struct *work)
5339 +static void free_ioctx(struct swork_event *sev)
5341 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5342 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5344         pr_debug("freeing %p\n", ctx);
5346 @@ -602,8 +604,8 @@
5347         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5348                 complete(&ctx->rq_wait->comp);
5350 -       INIT_WORK(&ctx->free_work, free_ioctx);
5351 -       schedule_work(&ctx->free_work);
5352 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5353 +       swork_queue(&ctx->free_work);
5356  /*
5357 @@ -611,9 +613,9 @@
5358   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5359   * now it's safe to cancel any that need to be.
5360   */
5361 -static void free_ioctx_users(struct percpu_ref *ref)
5362 +static void free_ioctx_users_work(struct swork_event *sev)
5364 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5365 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5366         struct aio_kiocb *req;
5368         spin_lock_irq(&ctx->ctx_lock);
5369 @@ -632,6 +634,14 @@
5370         percpu_ref_put(&ctx->reqs);
5373 +static void free_ioctx_users(struct percpu_ref *ref)
5375 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5377 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5378 +       swork_queue(&ctx->free_work);
5381  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5383         unsigned i, new_nr;
5384 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/autofs4/autofs_i.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/autofs4/autofs_i.h
5385 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/autofs4/autofs_i.h   2017-04-16 10:38:19.000000000 +0200
5386 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/autofs4/autofs_i.h        2017-04-18 17:54:25.000000000 +0200
5387 @@ -31,6 +31,7 @@
5388  #include <linux/sched.h>
5389  #include <linux/mount.h>
5390  #include <linux/namei.h>
5391 +#include <linux/delay.h>
5392  #include <asm/current.h>
5393  #include <linux/uaccess.h>
5395 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/autofs4/expire.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/autofs4/expire.c
5396 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/autofs4/expire.c     2017-04-16 10:38:19.000000000 +0200
5397 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/autofs4/expire.c  2017-04-18 17:54:25.000000000 +0200
5398 @@ -148,7 +148,7 @@
5399                         parent = p->d_parent;
5400                         if (!spin_trylock(&parent->d_lock)) {
5401                                 spin_unlock(&p->d_lock);
5402 -                               cpu_relax();
5403 +                               cpu_chill();
5404                                 goto relock;
5405                         }
5406                         spin_unlock(&p->d_lock);
5407 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/buffer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/buffer.c
5408 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/buffer.c     2017-04-16 10:38:19.000000000 +0200
5409 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/buffer.c  2017-04-18 17:54:25.000000000 +0200
5410 @@ -301,8 +301,7 @@
5411          * decide that the page is now completely done.
5412          */
5413         first = page_buffers(page);
5414 -       local_irq_save(flags);
5415 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5416 +       flags = bh_uptodate_lock_irqsave(first);
5417         clear_buffer_async_read(bh);
5418         unlock_buffer(bh);
5419         tmp = bh;
5420 @@ -315,8 +314,7 @@
5421                 }
5422                 tmp = tmp->b_this_page;
5423         } while (tmp != bh);
5424 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5425 -       local_irq_restore(flags);
5426 +       bh_uptodate_unlock_irqrestore(first, flags);
5428         /*
5429          * If none of the buffers had errors and they are all
5430 @@ -328,9 +326,7 @@
5431         return;
5433  still_busy:
5434 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5435 -       local_irq_restore(flags);
5436 -       return;
5437 +       bh_uptodate_unlock_irqrestore(first, flags);
5440  /*
5441 @@ -358,8 +354,7 @@
5442         }
5444         first = page_buffers(page);
5445 -       local_irq_save(flags);
5446 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5447 +       flags = bh_uptodate_lock_irqsave(first);
5449         clear_buffer_async_write(bh);
5450         unlock_buffer(bh);
5451 @@ -371,15 +366,12 @@
5452                 }
5453                 tmp = tmp->b_this_page;
5454         }
5455 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5456 -       local_irq_restore(flags);
5457 +       bh_uptodate_unlock_irqrestore(first, flags);
5458         end_page_writeback(page);
5459         return;
5461  still_busy:
5462 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5463 -       local_irq_restore(flags);
5464 -       return;
5465 +       bh_uptodate_unlock_irqrestore(first, flags);
5467  EXPORT_SYMBOL(end_buffer_async_write);
5469 @@ -3383,6 +3375,7 @@
5470         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5471         if (ret) {
5472                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5473 +               buffer_head_init_locks(ret);
5474                 preempt_disable();
5475                 __this_cpu_inc(bh_accounting.nr);
5476                 recalc_bh_state();
5477 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/cifs/readdir.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/cifs/readdir.c
5478 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/cifs/readdir.c       2017-04-16 10:38:20.000000000 +0200
5479 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/cifs/readdir.c    2017-04-18 17:54:25.000000000 +0200
5480 @@ -80,7 +80,7 @@
5481         struct inode *inode;
5482         struct super_block *sb = parent->d_sb;
5483         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5484 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5485 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5487         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5489 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/dcache.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/dcache.c
5490 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/dcache.c     2017-04-16 10:38:20.000000000 +0200
5491 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/dcache.c  2017-04-18 17:54:25.000000000 +0200
5492 @@ -19,6 +19,7 @@
5493  #include <linux/mm.h>
5494  #include <linux/fs.h>
5495  #include <linux/fsnotify.h>
5496 +#include <linux/delay.h>
5497  #include <linux/slab.h>
5498  #include <linux/init.h>
5499  #include <linux/hash.h>
5500 @@ -750,6 +751,8 @@
5501   */
5502  void dput(struct dentry *dentry)
5504 +       struct dentry *parent;
5506         if (unlikely(!dentry))
5507                 return;
5509 @@ -788,9 +791,18 @@
5510         return;
5512  kill_it:
5513 -       dentry = dentry_kill(dentry);
5514 -       if (dentry) {
5515 -               cond_resched();
5516 +       parent = dentry_kill(dentry);
5517 +       if (parent) {
5518 +               int r;
5520 +               if (parent == dentry) {
5521 +                       /* the task with the highest priority won't schedule */
5522 +                       r = cond_resched();
5523 +                       if (!r)
5524 +                               cpu_chill();
5525 +               } else {
5526 +                       dentry = parent;
5527 +               }
5528                 goto repeat;
5529         }
5531 @@ -2324,7 +2336,7 @@
5532         if (dentry->d_lockref.count == 1) {
5533                 if (!spin_trylock(&inode->i_lock)) {
5534                         spin_unlock(&dentry->d_lock);
5535 -                       cpu_relax();
5536 +                       cpu_chill();
5537                         goto again;
5538                 }
5539                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5540 @@ -2384,21 +2396,24 @@
5542  static void d_wait_lookup(struct dentry *dentry)
5544 -       if (d_in_lookup(dentry)) {
5545 -               DECLARE_WAITQUEUE(wait, current);
5546 -               add_wait_queue(dentry->d_wait, &wait);
5547 -               do {
5548 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5549 -                       spin_unlock(&dentry->d_lock);
5550 -                       schedule();
5551 -                       spin_lock(&dentry->d_lock);
5552 -               } while (d_in_lookup(dentry));
5553 -       }
5554 +       struct swait_queue __wait;
5556 +       if (!d_in_lookup(dentry))
5557 +               return;
5559 +       INIT_LIST_HEAD(&__wait.task_list);
5560 +       do {
5561 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5562 +               spin_unlock(&dentry->d_lock);
5563 +               schedule();
5564 +               spin_lock(&dentry->d_lock);
5565 +       } while (d_in_lookup(dentry));
5566 +       finish_swait(dentry->d_wait, &__wait);
5569  struct dentry *d_alloc_parallel(struct dentry *parent,
5570                                 const struct qstr *name,
5571 -                               wait_queue_head_t *wq)
5572 +                               struct swait_queue_head *wq)
5574         unsigned int hash = name->hash;
5575         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5576 @@ -2507,7 +2522,7 @@
5577         hlist_bl_lock(b);
5578         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5579         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5580 -       wake_up_all(dentry->d_wait);
5581 +       swake_up_all(dentry->d_wait);
5582         dentry->d_wait = NULL;
5583         hlist_bl_unlock(b);
5584         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5585 @@ -3604,6 +3619,11 @@
5587  void __init vfs_caches_init_early(void)
5589 +       int i;
5591 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5592 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5594         dcache_init_early();
5595         inode_init_early();
5597 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/eventpoll.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/eventpoll.c
5598 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/eventpoll.c  2017-04-16 10:38:20.000000000 +0200
5599 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/eventpoll.c       2017-04-18 17:54:25.000000000 +0200
5600 @@ -510,12 +510,12 @@
5601   */
5602  static void ep_poll_safewake(wait_queue_head_t *wq)
5604 -       int this_cpu = get_cpu();
5605 +       int this_cpu = get_cpu_light();
5607         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5608                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5610 -       put_cpu();
5611 +       put_cpu_light();
5614  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5615 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/exec.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/exec.c
5616 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/exec.c       2017-04-16 10:38:20.000000000 +0200
5617 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/exec.c    2017-04-18 17:54:25.000000000 +0200
5618 @@ -1017,12 +1017,14 @@
5619                 }
5620         }
5621         task_lock(tsk);
5622 +       preempt_disable_rt();
5623         active_mm = tsk->active_mm;
5624         tsk->mm = mm;
5625         tsk->active_mm = mm;
5626         activate_mm(active_mm, mm);
5627         tsk->mm->vmacache_seqnum = 0;
5628         vmacache_flush(tsk);
5629 +       preempt_enable_rt();
5630         task_unlock(tsk);
5631         if (old_mm) {
5632                 up_read(&old_mm->mmap_sem);
5633 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/fuse/dir.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/fuse/dir.c
5634 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/fuse/dir.c   2017-04-16 10:38:20.000000000 +0200
5635 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/fuse/dir.c        2017-04-18 17:54:25.000000000 +0200
5636 @@ -1191,7 +1191,7 @@
5637         struct inode *dir = d_inode(parent);
5638         struct fuse_conn *fc;
5639         struct inode *inode;
5640 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5641 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5643         if (!o->nodeid) {
5644                 /*
5645 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/jbd2/checkpoint.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/jbd2/checkpoint.c
5646 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/jbd2/checkpoint.c    2017-04-16 10:38:20.000000000 +0200
5647 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/jbd2/checkpoint.c 2017-04-18 17:54:25.000000000 +0200
5648 @@ -116,6 +116,8 @@
5649         nblocks = jbd2_space_needed(journal);
5650         while (jbd2_log_space_left(journal) < nblocks) {
5651                 write_unlock(&journal->j_state_lock);
5652 +               if (current->plug)
5653 +                       io_schedule();
5654                 mutex_lock(&journal->j_checkpoint_mutex);
5656                 /*
5657 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/locks.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/locks.c
5658 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/locks.c      2017-04-16 10:38:21.000000000 +0200
5659 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/locks.c   2017-04-18 17:54:25.000000000 +0200
5660 @@ -935,7 +935,7 @@
5661                         return -ENOMEM;
5662         }
5664 -       percpu_down_read_preempt_disable(&file_rwsem);
5665 +       percpu_down_read(&file_rwsem);
5666         spin_lock(&ctx->flc_lock);
5667         if (request->fl_flags & FL_ACCESS)
5668                 goto find_conflict;
5669 @@ -976,7 +976,7 @@
5671  out:
5672         spin_unlock(&ctx->flc_lock);
5673 -       percpu_up_read_preempt_enable(&file_rwsem);
5674 +       percpu_up_read(&file_rwsem);
5675         if (new_fl)
5676                 locks_free_lock(new_fl);
5677         locks_dispose_list(&dispose);
5678 @@ -1013,7 +1013,7 @@
5679                 new_fl2 = locks_alloc_lock();
5680         }
5682 -       percpu_down_read_preempt_disable(&file_rwsem);
5683 +       percpu_down_read(&file_rwsem);
5684         spin_lock(&ctx->flc_lock);
5685         /*
5686          * New lock request. Walk all POSIX locks and look for conflicts. If
5687 @@ -1185,7 +1185,7 @@
5688         }
5689   out:
5690         spin_unlock(&ctx->flc_lock);
5691 -       percpu_up_read_preempt_enable(&file_rwsem);
5692 +       percpu_up_read(&file_rwsem);
5693         /*
5694          * Free any unused locks.
5695          */
5696 @@ -1460,7 +1460,7 @@
5697                 return error;
5698         }
5700 -       percpu_down_read_preempt_disable(&file_rwsem);
5701 +       percpu_down_read(&file_rwsem);
5702         spin_lock(&ctx->flc_lock);
5704         time_out_leases(inode, &dispose);
5705 @@ -1512,13 +1512,13 @@
5706         locks_insert_block(fl, new_fl);
5707         trace_break_lease_block(inode, new_fl);
5708         spin_unlock(&ctx->flc_lock);
5709 -       percpu_up_read_preempt_enable(&file_rwsem);
5710 +       percpu_up_read(&file_rwsem);
5712         locks_dispose_list(&dispose);
5713         error = wait_event_interruptible_timeout(new_fl->fl_wait,
5714                                                 !new_fl->fl_next, break_time);
5716 -       percpu_down_read_preempt_disable(&file_rwsem);
5717 +       percpu_down_read(&file_rwsem);
5718         spin_lock(&ctx->flc_lock);
5719         trace_break_lease_unblock(inode, new_fl);
5720         locks_delete_block(new_fl);
5721 @@ -1535,7 +1535,7 @@
5722         }
5723  out:
5724         spin_unlock(&ctx->flc_lock);
5725 -       percpu_up_read_preempt_enable(&file_rwsem);
5726 +       percpu_up_read(&file_rwsem);
5727         locks_dispose_list(&dispose);
5728         locks_free_lock(new_fl);
5729         return error;
5730 @@ -1609,7 +1609,7 @@
5732         ctx = smp_load_acquire(&inode->i_flctx);
5733         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5734 -               percpu_down_read_preempt_disable(&file_rwsem);
5735 +               percpu_down_read(&file_rwsem);
5736                 spin_lock(&ctx->flc_lock);
5737                 time_out_leases(inode, &dispose);
5738                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5739 @@ -1619,7 +1619,7 @@
5740                         break;
5741                 }
5742                 spin_unlock(&ctx->flc_lock);
5743 -               percpu_up_read_preempt_enable(&file_rwsem);
5744 +               percpu_up_read(&file_rwsem);
5746                 locks_dispose_list(&dispose);
5747         }
5748 @@ -1694,7 +1694,7 @@
5749                 return -EINVAL;
5750         }
5752 -       percpu_down_read_preempt_disable(&file_rwsem);
5753 +       percpu_down_read(&file_rwsem);
5754         spin_lock(&ctx->flc_lock);
5755         time_out_leases(inode, &dispose);
5756         error = check_conflicting_open(dentry, arg, lease->fl_flags);
5757 @@ -1765,7 +1765,7 @@
5758                 lease->fl_lmops->lm_setup(lease, priv);
5759  out:
5760         spin_unlock(&ctx->flc_lock);
5761 -       percpu_up_read_preempt_enable(&file_rwsem);
5762 +       percpu_up_read(&file_rwsem);
5763         locks_dispose_list(&dispose);
5764         if (is_deleg)
5765                 inode_unlock(inode);
5766 @@ -1788,7 +1788,7 @@
5767                 return error;
5768         }
5770 -       percpu_down_read_preempt_disable(&file_rwsem);
5771 +       percpu_down_read(&file_rwsem);
5772         spin_lock(&ctx->flc_lock);
5773         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5774                 if (fl->fl_file == filp &&
5775 @@ -1801,7 +1801,7 @@
5776         if (victim)
5777                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5778         spin_unlock(&ctx->flc_lock);
5779 -       percpu_up_read_preempt_enable(&file_rwsem);
5780 +       percpu_up_read(&file_rwsem);
5781         locks_dispose_list(&dispose);
5782         return error;
5784 @@ -2532,13 +2532,13 @@
5785         if (list_empty(&ctx->flc_lease))
5786                 return;
5788 -       percpu_down_read_preempt_disable(&file_rwsem);
5789 +       percpu_down_read(&file_rwsem);
5790         spin_lock(&ctx->flc_lock);
5791         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5792                 if (filp == fl->fl_file)
5793                         lease_modify(fl, F_UNLCK, &dispose);
5794         spin_unlock(&ctx->flc_lock);
5795 -       percpu_up_read_preempt_enable(&file_rwsem);
5796 +       percpu_up_read(&file_rwsem);
5798         locks_dispose_list(&dispose);
5800 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/namei.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/namei.c
5801 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/namei.c      2017-04-16 10:38:21.000000000 +0200
5802 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/namei.c   2017-04-18 17:54:25.000000000 +0200
5803 @@ -1626,7 +1626,7 @@
5805         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5806         struct inode *inode = dir->d_inode;
5807 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5808 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5810         inode_lock_shared(inode);
5811         /* Don't go there if it's already dead */
5812 @@ -3083,7 +3083,7 @@
5813         struct dentry *dentry;
5814         int error, create_error = 0;
5815         umode_t mode = op->mode;
5816 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5817 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5819         if (unlikely(IS_DEADDIR(dir_inode)))
5820                 return -ENOENT;
5821 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/namespace.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/namespace.c
5822 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/namespace.c  2017-04-16 10:38:21.000000000 +0200
5823 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/namespace.c       2017-04-18 17:54:25.000000000 +0200
5824 @@ -14,6 +14,7 @@
5825  #include <linux/mnt_namespace.h>
5826  #include <linux/user_namespace.h>
5827  #include <linux/namei.h>
5828 +#include <linux/delay.h>
5829  #include <linux/security.h>
5830  #include <linux/idr.h>
5831  #include <linux/init.h>                /* init_rootfs */
5832 @@ -356,8 +357,11 @@
5833          * incremented count after it has set MNT_WRITE_HOLD.
5834          */
5835         smp_mb();
5836 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5837 -               cpu_relax();
5838 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5839 +               preempt_enable();
5840 +               cpu_chill();
5841 +               preempt_disable();
5842 +       }
5843         /*
5844          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
5845          * be set to match its requirements. So we must not load that until
5846 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/delegation.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/delegation.c
5847 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/delegation.c     2017-04-16 10:38:21.000000000 +0200
5848 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/delegation.c  2017-04-18 17:54:25.000000000 +0200
5849 @@ -150,11 +150,11 @@
5850                 sp = state->owner;
5851                 /* Block nfs4_proc_unlck */
5852                 mutex_lock(&sp->so_delegreturn_mutex);
5853 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5854 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
5855                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
5856                 if (!err)
5857                         err = nfs_delegation_claim_locks(ctx, state, stateid);
5858 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5859 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
5860                         err = -EAGAIN;
5861                 mutex_unlock(&sp->so_delegreturn_mutex);
5862                 put_nfs_open_context(ctx);
5863 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/dir.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/dir.c
5864 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/dir.c    2017-04-16 10:38:21.000000000 +0200
5865 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/dir.c 2017-04-18 17:54:25.000000000 +0200
5866 @@ -485,7 +485,7 @@
5867  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
5869         struct qstr filename = QSTR_INIT(entry->name, entry->len);
5870 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5871 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5872         struct dentry *dentry;
5873         struct dentry *alias;
5874         struct inode *dir = d_inode(parent);
5875 @@ -1487,7 +1487,7 @@
5876                     struct file *file, unsigned open_flags,
5877                     umode_t mode, int *opened)
5879 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5880 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5881         struct nfs_open_context *ctx;
5882         struct dentry *res;
5883         struct iattr attr = { .ia_valid = ATTR_OPEN };
5884 @@ -1802,7 +1802,11 @@
5886         trace_nfs_rmdir_enter(dir, dentry);
5887         if (d_really_is_positive(dentry)) {
5888 +#ifdef CONFIG_PREEMPT_RT_BASE
5889 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
5890 +#else
5891                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5892 +#endif
5893                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5894                 /* Ensure the VFS deletes this inode */
5895                 switch (error) {
5896 @@ -1812,7 +1816,11 @@
5897                 case -ENOENT:
5898                         nfs_dentry_handle_enoent(dentry);
5899                 }
5900 +#ifdef CONFIG_PREEMPT_RT_BASE
5901 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
5902 +#else
5903                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5904 +#endif
5905         } else
5906                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5907         trace_nfs_rmdir_exit(dir, dentry, error);
5908 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/inode.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/inode.c
5909 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/inode.c  2017-04-16 10:38:21.000000000 +0200
5910 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/inode.c       2017-04-18 17:54:25.000000000 +0200
5911 @@ -1957,7 +1957,11 @@
5912         nfsi->nrequests = 0;
5913         nfsi->commit_info.ncommit = 0;
5914         atomic_set(&nfsi->commit_info.rpcs_out, 0);
5915 +#ifdef CONFIG_PREEMPT_RT_BASE
5916 +       sema_init(&nfsi->rmdir_sem, 1);
5917 +#else
5918         init_rwsem(&nfsi->rmdir_sem);
5919 +#endif
5920         nfs4_init_once(nfsi);
5923 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4_fs.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4_fs.h
5924 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4_fs.h        2017-04-16 10:38:21.000000000 +0200
5925 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4_fs.h     2017-04-18 17:54:25.000000000 +0200
5926 @@ -111,7 +111,7 @@
5927         unsigned long        so_flags;
5928         struct list_head     so_states;
5929         struct nfs_seqid_counter so_seqid;
5930 -       seqcount_t           so_reclaim_seqcount;
5931 +       seqlock_t            so_reclaim_seqlock;
5932         struct mutex         so_delegreturn_mutex;
5933  };
5935 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4proc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4proc.c
5936 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4proc.c       2017-04-16 10:38:21.000000000 +0200
5937 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4proc.c    2017-04-18 17:54:25.000000000 +0200
5938 @@ -2695,7 +2695,7 @@
5939         unsigned int seq;
5940         int ret;
5942 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5943 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5945         ret = _nfs4_proc_open(opendata);
5946         if (ret != 0)
5947 @@ -2733,7 +2733,7 @@
5949         if (d_inode(dentry) == state->inode) {
5950                 nfs_inode_attach_open_context(ctx);
5951 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5952 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
5953                         nfs4_schedule_stateid_recovery(server, state);
5954         }
5955  out:
5956 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4state.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4state.c
5957 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/nfs4state.c      2017-04-16 10:38:21.000000000 +0200
5958 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/nfs4state.c   2017-04-18 17:54:25.000000000 +0200
5959 @@ -488,7 +488,7 @@
5960         nfs4_init_seqid_counter(&sp->so_seqid);
5961         atomic_set(&sp->so_count, 1);
5962         INIT_LIST_HEAD(&sp->so_lru);
5963 -       seqcount_init(&sp->so_reclaim_seqcount);
5964 +       seqlock_init(&sp->so_reclaim_seqlock);
5965         mutex_init(&sp->so_delegreturn_mutex);
5966         return sp;
5968 @@ -1497,8 +1497,12 @@
5969          * recovering after a network partition or a reboot from a
5970          * server that doesn't support a grace period.
5971          */
5972 +#ifdef CONFIG_PREEMPT_RT_FULL
5973 +       write_seqlock(&sp->so_reclaim_seqlock);
5974 +#else
5975 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5976 +#endif
5977         spin_lock(&sp->so_lock);
5978 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
5979  restart:
5980         list_for_each_entry(state, &sp->so_states, open_states) {
5981                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
5982 @@ -1567,14 +1571,20 @@
5983                 spin_lock(&sp->so_lock);
5984                 goto restart;
5985         }
5986 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5987         spin_unlock(&sp->so_lock);
5988 +#ifdef CONFIG_PREEMPT_RT_FULL
5989 +       write_sequnlock(&sp->so_reclaim_seqlock);
5990 +#else
5991 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5992 +#endif
5993         return 0;
5994  out_err:
5995         nfs4_put_open_state(state);
5996 -       spin_lock(&sp->so_lock);
5997 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5998 -       spin_unlock(&sp->so_lock);
5999 +#ifdef CONFIG_PREEMPT_RT_FULL
6000 +       write_sequnlock(&sp->so_reclaim_seqlock);
6001 +#else
6002 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6003 +#endif
6004         return status;
6007 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/unlink.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/unlink.c
6008 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/nfs/unlink.c 2017-04-16 10:38:21.000000000 +0200
6009 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/nfs/unlink.c      2017-04-18 17:54:25.000000000 +0200
6010 @@ -12,7 +12,7 @@
6011  #include <linux/sunrpc/clnt.h>
6012  #include <linux/nfs_fs.h>
6013  #include <linux/sched.h>
6014 -#include <linux/wait.h>
6015 +#include <linux/swait.h>
6016  #include <linux/namei.h>
6017  #include <linux/fsnotify.h>
6019 @@ -51,6 +51,29 @@
6020                 rpc_restart_call_prepare(task);
6023 +#ifdef CONFIG_PREEMPT_RT_BASE
6024 +static void nfs_down_anon(struct semaphore *sema)
6026 +       down(sema);
6029 +static void nfs_up_anon(struct semaphore *sema)
6031 +       up(sema);
6034 +#else
6035 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6037 +       down_read_non_owner(rwsem);
6040 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6042 +       up_read_non_owner(rwsem);
6044 +#endif
6046  /**
6047   * nfs_async_unlink_release - Release the sillydelete data.
6048   * @task: rpc_task of the sillydelete
6049 @@ -64,7 +87,7 @@
6050         struct dentry *dentry = data->dentry;
6051         struct super_block *sb = dentry->d_sb;
6053 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6054 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6055         d_lookup_done(dentry);
6056         nfs_free_unlinkdata(data);
6057         dput(dentry);
6058 @@ -117,10 +140,10 @@
6059         struct inode *dir = d_inode(dentry->d_parent);
6060         struct dentry *alias;
6062 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6063 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6064         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6065         if (IS_ERR(alias)) {
6066 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6067 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6068                 return 0;
6069         }
6070         if (!d_in_lookup(alias)) {
6071 @@ -142,7 +165,7 @@
6072                         ret = 0;
6073                 spin_unlock(&alias->d_lock);
6074                 dput(alias);
6075 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6076 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6077                 /*
6078                  * If we'd displaced old cached devname, free it.  At that
6079                  * point dentry is definitely not a root, so we won't need
6080 @@ -182,7 +205,7 @@
6081                 goto out_free_name;
6082         }
6083         data->res.dir_attr = &data->dir_attr;
6084 -       init_waitqueue_head(&data->wq);
6085 +       init_swait_queue_head(&data->wq);
6087         status = -EBUSY;
6088         spin_lock(&dentry->d_lock);
6089 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/ntfs/aops.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/ntfs/aops.c
6090 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/ntfs/aops.c  2017-04-16 10:38:21.000000000 +0200
6091 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/ntfs/aops.c       2017-04-18 17:54:25.000000000 +0200
6092 @@ -92,13 +92,13 @@
6093                         ofs = 0;
6094                         if (file_ofs < init_size)
6095                                 ofs = init_size - file_ofs;
6096 -                       local_irq_save(flags);
6097 +                       local_irq_save_nort(flags);
6098                         kaddr = kmap_atomic(page);
6099                         memset(kaddr + bh_offset(bh) + ofs, 0,
6100                                         bh->b_size - ofs);
6101                         flush_dcache_page(page);
6102                         kunmap_atomic(kaddr);
6103 -                       local_irq_restore(flags);
6104 +                       local_irq_restore_nort(flags);
6105                 }
6106         } else {
6107                 clear_buffer_uptodate(bh);
6108 @@ -107,8 +107,7 @@
6109                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6110         }
6111         first = page_buffers(page);
6112 -       local_irq_save(flags);
6113 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6114 +       flags = bh_uptodate_lock_irqsave(first);
6115         clear_buffer_async_read(bh);
6116         unlock_buffer(bh);
6117         tmp = bh;
6118 @@ -123,8 +122,7 @@
6119                 }
6120                 tmp = tmp->b_this_page;
6121         } while (tmp != bh);
6122 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6123 -       local_irq_restore(flags);
6124 +       bh_uptodate_unlock_irqrestore(first, flags);
6125         /*
6126          * If none of the buffers had errors then we can set the page uptodate,
6127          * but we first have to perform the post read mst fixups, if the
6128 @@ -145,13 +143,13 @@
6129                 recs = PAGE_SIZE / rec_size;
6130                 /* Should have been verified before we got here... */
6131                 BUG_ON(!recs);
6132 -               local_irq_save(flags);
6133 +               local_irq_save_nort(flags);
6134                 kaddr = kmap_atomic(page);
6135                 for (i = 0; i < recs; i++)
6136                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6137                                         i * rec_size), rec_size);
6138                 kunmap_atomic(kaddr);
6139 -               local_irq_restore(flags);
6140 +               local_irq_restore_nort(flags);
6141                 flush_dcache_page(page);
6142                 if (likely(page_uptodate && !PageError(page)))
6143                         SetPageUptodate(page);
6144 @@ -159,9 +157,7 @@
6145         unlock_page(page);
6146         return;
6147  still_busy:
6148 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6149 -       local_irq_restore(flags);
6150 -       return;
6151 +       bh_uptodate_unlock_irqrestore(first, flags);
6154  /**
6155 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/proc/base.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/proc/base.c
6156 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/proc/base.c  2017-04-16 10:38:22.000000000 +0200
6157 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/proc/base.c       2017-04-18 17:54:25.000000000 +0200
6158 @@ -1834,7 +1834,7 @@
6160         child = d_hash_and_lookup(dir, &qname);
6161         if (!child) {
6162 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6163 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6164                 child = d_alloc_parallel(dir, &qname, &wq);
6165                 if (IS_ERR(child))
6166                         goto end_instantiate;
6167 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/proc/proc_sysctl.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/proc/proc_sysctl.c
6168 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/proc/proc_sysctl.c   2017-04-16 10:38:22.000000000 +0200
6169 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/proc/proc_sysctl.c        2017-04-18 17:54:26.000000000 +0200
6170 @@ -632,7 +632,7 @@
6172         child = d_lookup(dir, &qname);
6173         if (!child) {
6174 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6175 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6176                 child = d_alloc_parallel(dir, &qname, &wq);
6177                 if (IS_ERR(child))
6178                         return false;
6179 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/timerfd.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/timerfd.c
6180 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/fs/timerfd.c    2017-04-16 10:38:22.000000000 +0200
6181 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/fs/timerfd.c 2017-04-18 17:54:26.000000000 +0200
6182 @@ -460,7 +460,10 @@
6183                                 break;
6184                 }
6185                 spin_unlock_irq(&ctx->wqh.lock);
6186 -               cpu_relax();
6187 +               if (isalarm(ctx))
6188 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6189 +               else
6190 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6191         }
6193         /*
6194 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/acpi/platform/aclinux.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/acpi/platform/aclinux.h
6195 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/acpi/platform/aclinux.h 2017-04-16 10:38:23.000000000 +0200
6196 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/acpi/platform/aclinux.h      2017-04-18 17:54:26.000000000 +0200
6197 @@ -133,6 +133,7 @@
6199  #define acpi_cache_t                        struct kmem_cache
6200  #define acpi_spinlock                       spinlock_t *
6201 +#define acpi_raw_spinlock              raw_spinlock_t *
6202  #define acpi_cpu_flags                      unsigned long
6204  /* Use native linux version of acpi_os_allocate_zeroed */
6205 @@ -151,6 +152,20 @@
6206  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6207  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6209 +#define acpi_os_create_raw_lock(__handle)                      \
6210 +({                                                             \
6211 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6212 +                                                               \
6213 +        if (lock) {                                            \
6214 +               *(__handle) = lock;                             \
6215 +               raw_spin_lock_init(*(__handle));                \
6216 +        }                                                      \
6217 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6218 + })
6220 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6223  /*
6224   * OSL interfaces used by debugger/disassembler
6225   */
6226 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/asm-generic/bug.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/asm-generic/bug.h
6227 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/asm-generic/bug.h       2017-04-16 10:38:23.000000000 +0200
6228 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/asm-generic/bug.h    2017-04-18 17:54:26.000000000 +0200
6229 @@ -215,6 +215,20 @@
6230  # define WARN_ON_SMP(x)                        ({0;})
6231  #endif
6233 +#ifdef CONFIG_PREEMPT_RT_BASE
6234 +# define BUG_ON_RT(c)                  BUG_ON(c)
6235 +# define BUG_ON_NONRT(c)               do { } while (0)
6236 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6237 +# define WARN_ON_NONRT(condition)      do { } while (0)
6238 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6239 +#else
6240 +# define BUG_ON_RT(c)                  do { } while (0)
6241 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6242 +# define WARN_ON_RT(condition)         do { } while (0)
6243 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6244 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6245 +#endif
6247  #endif /* __ASSEMBLY__ */
6249  #endif
6250 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/blk-mq.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/blk-mq.h
6251 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/blk-mq.h  2017-04-16 10:38:24.000000000 +0200
6252 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/blk-mq.h       2017-04-18 17:54:26.000000000 +0200
6253 @@ -209,7 +209,7 @@
6254         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6258 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6259  int blk_mq_request_started(struct request *rq);
6260  void blk_mq_start_request(struct request *rq);
6261  void blk_mq_end_request(struct request *rq, int error);
6262 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/blkdev.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/blkdev.h
6263 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/blkdev.h  2017-04-16 10:38:24.000000000 +0200
6264 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/blkdev.h       2017-04-18 17:54:26.000000000 +0200
6265 @@ -89,6 +89,7 @@
6266         struct list_head queuelist;
6267         union {
6268                 struct call_single_data csd;
6269 +               struct work_struct work;
6270                 u64 fifo_time;
6271         };
6273 @@ -467,7 +468,7 @@
6274         struct throtl_data *td;
6275  #endif
6276         struct rcu_head         rcu_head;
6277 -       wait_queue_head_t       mq_freeze_wq;
6278 +       struct swait_queue_head mq_freeze_wq;
6279         struct percpu_ref       q_usage_counter;
6280         struct list_head        all_q_node;
6282 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/bottom_half.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/bottom_half.h
6283 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/bottom_half.h     2017-04-16 10:38:24.000000000 +0200
6284 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/bottom_half.h  2017-04-18 17:54:26.000000000 +0200
6285 @@ -3,6 +3,39 @@
6287  #include <linux/preempt.h>
6289 +#ifdef CONFIG_PREEMPT_RT_FULL
6291 +extern void __local_bh_disable(void);
6292 +extern void _local_bh_enable(void);
6293 +extern void __local_bh_enable(void);
6295 +static inline void local_bh_disable(void)
6297 +       __local_bh_disable();
6300 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6302 +       __local_bh_disable();
6305 +static inline void local_bh_enable(void)
6307 +       __local_bh_enable();
6310 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6312 +       __local_bh_enable();
6315 +static inline void local_bh_enable_ip(unsigned long ip)
6317 +       __local_bh_enable();
6320 +#else
6322  #ifdef CONFIG_TRACE_IRQFLAGS
6323  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6324  #else
6325 @@ -30,5 +63,6 @@
6327         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6329 +#endif
6331  #endif /* _LINUX_BH_H */
6332 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/buffer_head.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/buffer_head.h
6333 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/buffer_head.h     2017-04-16 10:38:24.000000000 +0200
6334 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/buffer_head.h  2017-04-18 17:54:26.000000000 +0200
6335 @@ -75,8 +75,50 @@
6336         struct address_space *b_assoc_map;      /* mapping this buffer is
6337                                                    associated with */
6338         atomic_t b_count;               /* users using this buffer_head */
6339 +#ifdef CONFIG_PREEMPT_RT_BASE
6340 +       spinlock_t b_uptodate_lock;
6341 +#if IS_ENABLED(CONFIG_JBD2)
6342 +       spinlock_t b_state_lock;
6343 +       spinlock_t b_journal_head_lock;
6344 +#endif
6345 +#endif
6346  };
6348 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6350 +       unsigned long flags;
6352 +#ifndef CONFIG_PREEMPT_RT_BASE
6353 +       local_irq_save(flags);
6354 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6355 +#else
6356 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6357 +#endif
6358 +       return flags;
6361 +static inline void
6362 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6364 +#ifndef CONFIG_PREEMPT_RT_BASE
6365 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6366 +       local_irq_restore(flags);
6367 +#else
6368 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6369 +#endif
6372 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6374 +#ifdef CONFIG_PREEMPT_RT_BASE
6375 +       spin_lock_init(&bh->b_uptodate_lock);
6376 +#if IS_ENABLED(CONFIG_JBD2)
6377 +       spin_lock_init(&bh->b_state_lock);
6378 +       spin_lock_init(&bh->b_journal_head_lock);
6379 +#endif
6380 +#endif
6383  /*
6384   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6385   * and buffer_foo() functions.
6386 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/cgroup-defs.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/cgroup-defs.h
6387 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/cgroup-defs.h     2017-04-16 10:38:24.000000000 +0200
6388 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/cgroup-defs.h  2017-04-18 17:54:26.000000000 +0200
6389 @@ -16,6 +16,7 @@
6390  #include <linux/percpu-refcount.h>
6391  #include <linux/percpu-rwsem.h>
6392  #include <linux/workqueue.h>
6393 +#include <linux/swork.h>
6395  #ifdef CONFIG_CGROUPS
6397 @@ -137,6 +138,7 @@
6398         /* percpu_ref killing and RCU release */
6399         struct rcu_head rcu_head;
6400         struct work_struct destroy_work;
6401 +       struct swork_event destroy_swork;
6402  };
6404  /*
6405 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/completion.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/completion.h
6406 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/completion.h      2017-04-16 10:38:24.000000000 +0200
6407 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/completion.h   2017-04-18 17:54:26.000000000 +0200
6408 @@ -7,8 +7,7 @@
6409   * Atomic wait-for-completion handler data structures.
6410   * See kernel/sched/completion.c for details.
6411   */
6413 -#include <linux/wait.h>
6414 +#include <linux/swait.h>
6416  /*
6417   * struct completion - structure used to maintain state for a "completion"
6418 @@ -24,11 +23,11 @@
6419   */
6420  struct completion {
6421         unsigned int done;
6422 -       wait_queue_head_t wait;
6423 +       struct swait_queue_head wait;
6424  };
6426  #define COMPLETION_INITIALIZER(work) \
6427 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6428 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6430  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6431         ({ init_completion(&work); work; })
6432 @@ -73,7 +72,7 @@
6433  static inline void init_completion(struct completion *x)
6435         x->done = 0;
6436 -       init_waitqueue_head(&x->wait);
6437 +       init_swait_queue_head(&x->wait);
6440  /**
6441 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/cpu.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/cpu.h
6442 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/cpu.h     2017-04-16 10:38:24.000000000 +0200
6443 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/cpu.h  2017-04-18 17:54:26.000000000 +0200
6444 @@ -182,6 +182,8 @@
6445  extern void put_online_cpus(void);
6446  extern void cpu_hotplug_disable(void);
6447  extern void cpu_hotplug_enable(void);
6448 +extern void pin_current_cpu(void);
6449 +extern void unpin_current_cpu(void);
6450  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6451  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6452  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6453 @@ -199,6 +201,8 @@
6454  #define put_online_cpus()      do { } while (0)
6455  #define cpu_hotplug_disable()  do { } while (0)
6456  #define cpu_hotplug_enable()   do { } while (0)
6457 +static inline void pin_current_cpu(void) { }
6458 +static inline void unpin_current_cpu(void) { }
6459  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6460  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6461  /* These aren't inline functions due to a GCC bug. */
6462 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/dcache.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/dcache.h
6463 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/dcache.h  2017-04-16 10:38:24.000000000 +0200
6464 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/dcache.h       2017-04-18 17:54:26.000000000 +0200
6465 @@ -11,6 +11,7 @@
6466  #include <linux/rcupdate.h>
6467  #include <linux/lockref.h>
6468  #include <linux/stringhash.h>
6469 +#include <linux/wait.h>
6471  struct path;
6472  struct vfsmount;
6473 @@ -100,7 +101,7 @@
6475         union {
6476                 struct list_head d_lru;         /* LRU list */
6477 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6478 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6479         };
6480         struct list_head d_child;       /* child of parent list */
6481         struct list_head d_subdirs;     /* our children */
6482 @@ -230,7 +231,7 @@
6483  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6484  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6485  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6486 -                                       wait_queue_head_t *);
6487 +                                       struct swait_queue_head *);
6488  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6489  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6490  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6491 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/delay.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/delay.h
6492 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/delay.h   2017-04-16 10:38:24.000000000 +0200
6493 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/delay.h        2017-04-18 17:54:26.000000000 +0200
6494 @@ -52,4 +52,10 @@
6495         msleep(seconds * 1000);
6498 +#ifdef CONFIG_PREEMPT_RT_FULL
6499 +extern void cpu_chill(void);
6500 +#else
6501 +# define cpu_chill()   cpu_relax()
6502 +#endif
6504  #endif /* defined(_LINUX_DELAY_H) */
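
Illustration (not part of the patch): cpu_chill() is meant to replace cpu_relax() in retry loops that could otherwise starve a preempted lock or resource owner on RT; on !RT it falls back to cpu_relax(). A minimal sketch, where resource_is_busy() is a hypothetical predicate:

    #include <linux/delay.h>
    #include <linux/types.h>

    extern bool resource_is_busy(void);     /* hypothetical helper */

    static void wait_for_resource(void)
    {
            while (resource_is_busy())
                    cpu_chill();    /* sleeps briefly on PREEMPT_RT_FULL */
    }
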
6505 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/highmem.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/highmem.h
6506 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/highmem.h 2017-04-16 10:38:24.000000000 +0200
6507 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/highmem.h      2017-04-18 17:54:26.000000000 +0200
6508 @@ -7,6 +7,7 @@
6509  #include <linux/mm.h>
6510  #include <linux/uaccess.h>
6511  #include <linux/hardirq.h>
6512 +#include <linux/sched.h>
6514  #include <asm/cacheflush.h>
6516 @@ -65,7 +66,7 @@
6518  static inline void *kmap_atomic(struct page *page)
6520 -       preempt_disable();
6521 +       preempt_disable_nort();
6522         pagefault_disable();
6523         return page_address(page);
6525 @@ -74,7 +75,7 @@
6526  static inline void __kunmap_atomic(void *addr)
6528         pagefault_enable();
6529 -       preempt_enable();
6530 +       preempt_enable_nort();
6533  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
6534 @@ -86,32 +87,51 @@
6536  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6538 +#ifndef CONFIG_PREEMPT_RT_FULL
6539  DECLARE_PER_CPU(int, __kmap_atomic_idx);
6540 +#endif
6542  static inline int kmap_atomic_idx_push(void)
6544 +#ifndef CONFIG_PREEMPT_RT_FULL
6545         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6547 -#ifdef CONFIG_DEBUG_HIGHMEM
6548 +# ifdef CONFIG_DEBUG_HIGHMEM
6549         WARN_ON_ONCE(in_irq() && !irqs_disabled());
6550         BUG_ON(idx >= KM_TYPE_NR);
6551 -#endif
6552 +# endif
6553         return idx;
6554 +#else
6555 +       current->kmap_idx++;
6556 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
6557 +       return current->kmap_idx - 1;
6558 +#endif
6561  static inline int kmap_atomic_idx(void)
6563 +#ifndef CONFIG_PREEMPT_RT_FULL
6564         return __this_cpu_read(__kmap_atomic_idx) - 1;
6565 +#else
6566 +       return current->kmap_idx - 1;
6567 +#endif
6570  static inline void kmap_atomic_idx_pop(void)
6572 -#ifdef CONFIG_DEBUG_HIGHMEM
6573 +#ifndef CONFIG_PREEMPT_RT_FULL
6574 +# ifdef CONFIG_DEBUG_HIGHMEM
6575         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6577         BUG_ON(idx < 0);
6578 -#else
6579 +# else
6580         __this_cpu_dec(__kmap_atomic_idx);
6581 +# endif
6582 +#else
6583 +       current->kmap_idx--;
6584 +# ifdef CONFIG_DEBUG_HIGHMEM
6585 +       BUG_ON(current->kmap_idx < 0);
6586 +# endif
6587  #endif
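
Illustration (not part of the patch): on RT the atomic-kmap nesting index moves from the per-CPU __kmap_atomic_idx counter into the task (current->kmap_idx, added to task_struct elsewhere in this patch) because kmap_atomic sections stay preemptible. Callers are unchanged; a minimal sketch:

    #include <linux/highmem.h>

    static u8 read_first_byte(struct page *page)
    {
            void *va = kmap_atomic(page);   /* preempt_disable_nort() + pagefault_disable() */
            u8 val = *(u8 *)va;

            kunmap_atomic(va);
            return val;
    }
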
6590 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/hrtimer.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/hrtimer.h
6591 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/hrtimer.h 2017-04-16 10:38:24.000000000 +0200
6592 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/hrtimer.h      2017-04-18 17:54:26.000000000 +0200
6593 @@ -87,6 +87,9 @@
6594   * @function:  timer expiry callback function
6595   * @base:      pointer to the timer base (per cpu and per clock)
6596   * @state:     state information (See bit values above)
6597 + * @cb_entry:  list entry to defer timers from hardirq context
6598 + * @irqsafe:   timer can run in hardirq context
6599 + * @praecox:   timer expiry time if expired at the time of programming
6600   * @is_rel:    Set if the timer was armed relative
6601   * @start_pid:  timer statistics field to store the pid of the task which
6602   *             started the timer
6603 @@ -103,6 +106,11 @@
6604         enum hrtimer_restart            (*function)(struct hrtimer *);
6605         struct hrtimer_clock_base       *base;
6606         u8                              state;
6607 +       struct list_head                cb_entry;
6608 +       int                             irqsafe;
6609 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6610 +       ktime_t                         praecox;
6611 +#endif
6612         u8                              is_rel;
6613  #ifdef CONFIG_TIMER_STATS
6614         int                             start_pid;
6615 @@ -123,11 +131,7 @@
6616         struct task_struct *task;
6617  };
6619 -#ifdef CONFIG_64BIT
6620  # define HRTIMER_CLOCK_BASE_ALIGN      64
6621 -#else
6622 -# define HRTIMER_CLOCK_BASE_ALIGN      32
6623 -#endif
6625  /**
6626   * struct hrtimer_clock_base - the timer base for a specific clock
6627 @@ -136,6 +140,7 @@
6628   *                     timer to a base on another cpu.
6629   * @clockid:           clock id for per_cpu support
6630   * @active:            red black tree root node for the active timers
6631 + * @expired:           list head for deferred timers.
6632   * @get_time:          function to retrieve the current time of the clock
6633   * @offset:            offset of this clock to the monotonic base
6634   */
6635 @@ -144,6 +149,7 @@
6636         int                     index;
6637         clockid_t               clockid;
6638         struct timerqueue_head  active;
6639 +       struct list_head        expired;
6640         ktime_t                 (*get_time)(void);
6641         ktime_t                 offset;
6642  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6643 @@ -187,6 +193,7 @@
6644         raw_spinlock_t                  lock;
6645         seqcount_t                      seq;
6646         struct hrtimer                  *running;
6647 +       struct hrtimer                  *running_soft;
6648         unsigned int                    cpu;
6649         unsigned int                    active_bases;
6650         unsigned int                    clock_was_set_seq;
6651 @@ -203,6 +210,9 @@
6652         unsigned int                    nr_hangs;
6653         unsigned int                    max_hang_time;
6654  #endif
6655 +#ifdef CONFIG_PREEMPT_RT_BASE
6656 +       wait_queue_head_t               wait;
6657 +#endif
6658         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
6659  } ____cacheline_aligned;
6661 @@ -412,6 +422,13 @@
6662         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6665 +/* Softirq preemption could deadlock timer removal */
6666 +#ifdef CONFIG_PREEMPT_RT_BASE
6667 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6668 +#else
6669 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6670 +#endif
6672  /* Query timers: */
6673  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6675 @@ -436,9 +453,15 @@
6676   * Helper function to check, whether the timer is running the callback
6677   * function
6678   */
6679 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6680 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6682 -       return timer->base->cpu_base->running == timer;
6683 +       if (timer->base->cpu_base->running == timer)
6684 +               return 1;
6685 +#ifdef CONFIG_PREEMPT_RT_BASE
6686 +       if (timer->base->cpu_base->running_soft == timer)
6687 +               return 1;
6688 +#endif
6689 +       return 0;
6692  /* Forward a hrtimer so it expires after now: */
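
Illustration (not part of the patch): because softirq-context timer callbacks can be preempted on RT, a task cancelling a timer must not spin-wait on a running callback; hrtimer_wait_for_timer() (declared above) blocks until it finishes, and hrtimer_callback_running() now also checks the soft-context slot. A minimal cancel sketch:

    #include <linux/hrtimer.h>

    static void stop_my_timer(struct hrtimer *timer)
    {
            /* -1 from hrtimer_try_to_cancel() means the callback is running now */
            while (hrtimer_try_to_cancel(timer) < 0)
                    hrtimer_wait_for_timer(timer);
    }
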
6693 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/idr.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/idr.h
6694 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/idr.h     2017-04-16 10:38:24.000000000 +0200
6695 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/idr.h  2017-04-18 17:54:26.000000000 +0200
6696 @@ -95,10 +95,14 @@
6697   * Each idr_preload() should be matched with an invocation of this
6698   * function.  See idr_preload() for details.
6699   */
6700 +#ifdef CONFIG_PREEMPT_RT_FULL
6701 +void idr_preload_end(void);
6702 +#else
6703  static inline void idr_preload_end(void)
6705         preempt_enable();
6707 +#endif
6709  /**
6710   * idr_find - return pointer for given id
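
Illustration (not part of the patch): on RT idr_preload_end() can no longer be a bare preempt_enable(), so it becomes an out-of-line function; the caller-side pattern stays the canonical preload/alloc/end sequence. A minimal sketch, where my_idr and my_lock are placeholders:

    #include <linux/idr.h>
    #include <linux/gfp.h>
    #include <linux/spinlock.h>

    static DEFINE_IDR(my_idr);
    static DEFINE_SPINLOCK(my_lock);

    static int register_object(void *obj)
    {
            int id;

            idr_preload(GFP_KERNEL);
            spin_lock(&my_lock);
            id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
            spin_unlock(&my_lock);
            idr_preload_end();

            return id;      /* >= 0 on success, negative errno on failure */
    }
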
6711 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/init_task.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/init_task.h
6712 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/init_task.h       2017-04-16 10:38:24.000000000 +0200
6713 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/init_task.h    2017-04-18 17:54:26.000000000 +0200
6714 @@ -150,6 +150,12 @@
6715  # define INIT_PERF_EVENTS(tsk)
6716  #endif
6718 +#ifdef CONFIG_PREEMPT_RT_BASE
6719 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
6720 +#else
6721 +# define INIT_TIMER_LIST
6722 +#endif
6724  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6725  # define INIT_VTIME(tsk)                                               \
6726         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
6727 @@ -250,6 +256,7 @@
6728         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
6729         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
6730         .timer_slack_ns = 50000, /* 50 usec default slack */            \
6731 +       INIT_TIMER_LIST                                                 \
6732         .pids = {                                                       \
6733                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
6734                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
6735 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/interrupt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/interrupt.h
6736 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/interrupt.h       2017-04-16 10:38:24.000000000 +0200
6737 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/interrupt.h    2017-04-18 17:54:26.000000000 +0200
6738 @@ -14,6 +14,7 @@
6739  #include <linux/hrtimer.h>
6740  #include <linux/kref.h>
6741  #include <linux/workqueue.h>
6742 +#include <linux/swork.h>
6744  #include <linux/atomic.h>
6745  #include <asm/ptrace.h>
6746 @@ -61,6 +62,7 @@
6747   *                interrupt handler after suspending interrupts. For system
6748   *                wakeup devices users need to implement wakeup detection in
6749   *                their interrupt handlers.
6750 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6751   */
6752  #define IRQF_SHARED            0x00000080
6753  #define IRQF_PROBE_SHARED      0x00000100
6754 @@ -74,6 +76,7 @@
6755  #define IRQF_NO_THREAD         0x00010000
6756  #define IRQF_EARLY_RESUME      0x00020000
6757  #define IRQF_COND_SUSPEND      0x00040000
6758 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
6760  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6762 @@ -196,7 +199,7 @@
6763  #ifdef CONFIG_LOCKDEP
6764  # define local_irq_enable_in_hardirq() do { } while (0)
6765  #else
6766 -# define local_irq_enable_in_hardirq() local_irq_enable()
6767 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6768  #endif
6770  extern void disable_irq_nosync(unsigned int irq);
6771 @@ -216,6 +219,7 @@
6772   * struct irq_affinity_notify - context for notification of IRQ affinity changes
6773   * @irq:               Interrupt to which notification applies
6774   * @kref:              Reference count, for internal use
6775 + * @swork:             Swork item, for internal use
6776   * @work:              Work item, for internal use
6777   * @notify:            Function to be called on change.  This will be
6778   *                     called in process context.
6779 @@ -227,7 +231,11 @@
6780  struct irq_affinity_notify {
6781         unsigned int irq;
6782         struct kref kref;
6783 +#ifdef CONFIG_PREEMPT_RT_BASE
6784 +       struct swork_event swork;
6785 +#else
6786         struct work_struct work;
6787 +#endif
6788         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6789         void (*release)(struct kref *ref);
6790  };
6791 @@ -406,9 +414,13 @@
6792                                  bool state);
6794  #ifdef CONFIG_IRQ_FORCED_THREADING
6795 +# ifndef CONFIG_PREEMPT_RT_BASE
6796  extern bool force_irqthreads;
6797 +# else
6798 +#  define force_irqthreads     (true)
6799 +# endif
6800  #else
6801 -#define force_irqthreads       (0)
6802 +#define force_irqthreads       (false)
6803  #endif
6805  #ifndef __ARCH_SET_SOFTIRQ_PENDING
6806 @@ -465,9 +477,10 @@
6807         void    (*action)(struct softirq_action *);
6808  };
6810 +#ifndef CONFIG_PREEMPT_RT_FULL
6811  asmlinkage void do_softirq(void);
6812  asmlinkage void __do_softirq(void);
6814 +static inline void thread_do_softirq(void) { do_softirq(); }
6815  #ifdef __ARCH_HAS_DO_SOFTIRQ
6816  void do_softirq_own_stack(void);
6817  #else
6818 @@ -476,13 +489,25 @@
6819         __do_softirq();
6821  #endif
6822 +#else
6823 +extern void thread_do_softirq(void);
6824 +#endif
6826  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
6827  extern void softirq_init(void);
6828  extern void __raise_softirq_irqoff(unsigned int nr);
6829 +#ifdef CONFIG_PREEMPT_RT_FULL
6830 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
6831 +#else
6832 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
6834 +       __raise_softirq_irqoff(nr);
6836 +#endif
6838  extern void raise_softirq_irqoff(unsigned int nr);
6839  extern void raise_softirq(unsigned int nr);
6840 +extern void softirq_check_pending_idle(void);
6842  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
6844 @@ -504,8 +529,9 @@
6845       to be executed on some cpu at least once after this.
6846     * If the tasklet is already scheduled, but its execution is still not
6847       started, it will be executed only once.
6848 -   * If this tasklet is already running on another CPU (or schedule is called
6849 -     from tasklet itself), it is rescheduled for later.
6850 +   * If this tasklet is already running on another CPU, it is rescheduled
6851 +     for later.
6852 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
6853     * Tasklet is strictly serialized wrt itself, but not
6854       wrt another tasklets. If client needs some intertask synchronization,
6855       he makes it with spinlocks.
6856 @@ -530,27 +556,36 @@
6857  enum
6859         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
6860 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
6861 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
6862 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
6863  };
6865 -#ifdef CONFIG_SMP
6866 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
6867 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
6868 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
6870 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
6871  static inline int tasklet_trylock(struct tasklet_struct *t)
6873         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
6876 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
6878 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
6881  static inline void tasklet_unlock(struct tasklet_struct *t)
6883         smp_mb__before_atomic();
6884         clear_bit(TASKLET_STATE_RUN, &(t)->state);
6887 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
6889 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
6891 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
6893  #else
6894  #define tasklet_trylock(t) 1
6895 +#define tasklet_tryunlock(t)   1
6896  #define tasklet_unlock_wait(t) do { } while (0)
6897  #define tasklet_unlock(t) do { } while (0)
6898  #endif
6899 @@ -599,12 +634,7 @@
6900         smp_mb();
6903 -static inline void tasklet_enable(struct tasklet_struct *t)
6905 -       smp_mb__before_atomic();
6906 -       atomic_dec(&t->count);
6909 +extern void tasklet_enable(struct tasklet_struct *t);
6910  extern void tasklet_kill(struct tasklet_struct *t);
6911  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
6912  extern void tasklet_init(struct tasklet_struct *t,
6913 @@ -635,6 +665,12 @@
6914         tasklet_kill(&ttimer->tasklet);
6917 +#ifdef CONFIG_PREEMPT_RT_FULL
6918 +extern void softirq_early_init(void);
6919 +#else
6920 +static inline void softirq_early_init(void) { }
6921 +#endif
6923  /*
6924   * Autoprobing for irqs:
6925   *
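
Illustration (not part of the patch): IRQF_NO_SOFTIRQ_CALL (added above) tells the RT core not to run pending softirqs from this interrupt's handler thread, while the tasklet changes add a PENDING state and tasklet_tryunlock() so tasklet execution can be made preemptible. A minimal request sketch, with my_irq/my_handler/my_dev as placeholders:

    #include <linux/interrupt.h>

    static int setup_my_irq(unsigned int my_irq, irq_handler_t my_handler, void *my_dev)
    {
            return request_irq(my_irq, my_handler, IRQF_NO_SOFTIRQ_CALL,
                               "my-dev", my_dev);
    }
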
6926 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irq.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irq.h
6927 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irq.h     2017-04-16 10:38:24.000000000 +0200
6928 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irq.h  2017-04-18 17:54:26.000000000 +0200
6929 @@ -72,6 +72,7 @@
6930   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
6931   *                               it from the spurious interrupt detection
6932   *                               mechanism and from core side polling.
6933 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
6934   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
6935   */
6936  enum {
6937 @@ -99,13 +100,14 @@
6938         IRQ_PER_CPU_DEVID       = (1 << 17),
6939         IRQ_IS_POLLED           = (1 << 18),
6940         IRQ_DISABLE_UNLAZY      = (1 << 19),
6941 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
6942  };
6944  #define IRQF_MODIFY_MASK       \
6945         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
6946          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
6947          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
6948 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
6949 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
6951  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
6953 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irq_work.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irq_work.h
6954 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irq_work.h        2017-04-16 10:38:24.000000000 +0200
6955 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irq_work.h     2017-04-18 17:54:26.000000000 +0200
6956 @@ -16,6 +16,7 @@
6957  #define IRQ_WORK_BUSY          2UL
6958  #define IRQ_WORK_FLAGS         3UL
6959  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
6960 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
6962  struct irq_work {
6963         unsigned long flags;
6964 @@ -51,4 +52,10 @@
6965  static inline void irq_work_run(void) { }
6966  #endif
6968 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
6969 +void irq_work_tick_soft(void);
6970 +#else
6971 +static inline void irq_work_tick_soft(void) { }
6972 +#endif
6974  #endif /* _LINUX_IRQ_WORK_H */
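
Illustration (not part of the patch): on RT most irq_work items are deferred to softirq context, while IRQ_WORK_HARD_IRQ (added above) keeps an item in hard interrupt context and irq_work_tick_soft() runs the deferred ones. A minimal sketch of a hard-irq item; my_work and my_cb are placeholders:

    #include <linux/irq_work.h>

    static void my_cb(struct irq_work *work)
    {
            /* runs from the hard interrupt path, even on RT */
    }

    static struct irq_work my_work = {
            .flags = IRQ_WORK_HARD_IRQ,
            .func  = my_cb,
    };

    static void kick_my_work(void)
    {
            irq_work_queue(&my_work);
    }
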
6975 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irqdesc.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irqdesc.h
6976 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irqdesc.h 2017-04-16 10:38:24.000000000 +0200
6977 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irqdesc.h      2017-04-18 17:54:26.000000000 +0200
6978 @@ -66,6 +66,7 @@
6979         unsigned int            irqs_unhandled;
6980         atomic_t                threads_handled;
6981         int                     threads_handled_last;
6982 +       u64                     random_ip;
6983         raw_spinlock_t          lock;
6984         struct cpumask          *percpu_enabled;
6985         const struct cpumask    *percpu_affinity;
6986 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irqflags.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irqflags.h
6987 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/irqflags.h        2017-04-16 10:38:24.000000000 +0200
6988 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/irqflags.h     2017-04-18 17:54:26.000000000 +0200
6989 @@ -25,8 +25,6 @@
6990  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
6991  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
6992  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
6993 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
6994 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
6995  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
6996  #else
6997  # define trace_hardirqs_on()           do { } while (0)
6998 @@ -39,9 +37,15 @@
6999  # define trace_softirqs_enabled(p)     0
7000  # define trace_hardirq_enter()         do { } while (0)
7001  # define trace_hardirq_exit()          do { } while (0)
7002 +# define INIT_TRACE_IRQFLAGS
7003 +#endif
7005 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7006 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7007 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7008 +#else
7009  # define lockdep_softirq_enter()       do { } while (0)
7010  # define lockdep_softirq_exit()                do { } while (0)
7011 -# define INIT_TRACE_IRQFLAGS
7012  #endif
7014  #if defined(CONFIG_IRQSOFF_TRACER) || \
7015 @@ -148,4 +152,23 @@
7017  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7020 + * local_irq* variants depending on RT/!RT
7021 + */
7022 +#ifdef CONFIG_PREEMPT_RT_FULL
7023 +# define local_irq_disable_nort()      do { } while (0)
7024 +# define local_irq_enable_nort()       do { } while (0)
7025 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7026 +# define local_irq_restore_nort(flags) (void)(flags)
7027 +# define local_irq_disable_rt()                local_irq_disable()
7028 +# define local_irq_enable_rt()         local_irq_enable()
7029 +#else
7030 +# define local_irq_disable_nort()      local_irq_disable()
7031 +# define local_irq_enable_nort()       local_irq_enable()
7032 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7033 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7034 +# define local_irq_disable_rt()                do { } while (0)
7035 +# define local_irq_enable_rt()         do { } while (0)
7036 +#endif
7038  #endif
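
Illustration (not part of the patch): the *_nort variants disable interrupts only on !RT kernels; on RT the section stays preemptible and must be covered by other serialization (typically a sleeping spinlock or a local lock). A minimal sketch:

    #include <linux/irqflags.h>

    static void touch_hw_shadow_state(void)
    {
            unsigned long flags;

            local_irq_save_nort(flags);
            /* ... short, non-sleeping access; interrupts stay enabled on RT ... */
            local_irq_restore_nort(flags);
    }
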
7039 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/jbd2.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/jbd2.h
7040 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/jbd2.h    2017-04-16 10:38:24.000000000 +0200
7041 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/jbd2.h 2017-04-18 17:54:26.000000000 +0200
7042 @@ -347,32 +347,56 @@
7044  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7046 +#ifndef CONFIG_PREEMPT_RT_BASE
7047         bit_spin_lock(BH_State, &bh->b_state);
7048 +#else
7049 +       spin_lock(&bh->b_state_lock);
7050 +#endif
7053  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7055 +#ifndef CONFIG_PREEMPT_RT_BASE
7056         return bit_spin_trylock(BH_State, &bh->b_state);
7057 +#else
7058 +       return spin_trylock(&bh->b_state_lock);
7059 +#endif
7062  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7064 +#ifndef CONFIG_PREEMPT_RT_BASE
7065         return bit_spin_is_locked(BH_State, &bh->b_state);
7066 +#else
7067 +       return spin_is_locked(&bh->b_state_lock);
7068 +#endif
7071  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7073 +#ifndef CONFIG_PREEMPT_RT_BASE
7074         bit_spin_unlock(BH_State, &bh->b_state);
7075 +#else
7076 +       spin_unlock(&bh->b_state_lock);
7077 +#endif
7080  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7082 +#ifndef CONFIG_PREEMPT_RT_BASE
7083         bit_spin_lock(BH_JournalHead, &bh->b_state);
7084 +#else
7085 +       spin_lock(&bh->b_journal_head_lock);
7086 +#endif
7089  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7091 +#ifndef CONFIG_PREEMPT_RT_BASE
7092         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7093 +#else
7094 +       spin_unlock(&bh->b_journal_head_lock);
7095 +#endif
7098  #define J_ASSERT(assert)       BUG_ON(!(assert))
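
Illustration (not part of the patch): on RT the BH_State/BH_JournalHead bit spinlocks become real spinlocks (b_state_lock and b_journal_head_lock, added to struct buffer_head elsewhere in this patch), so jbd2 call sites keep the same helpers. A minimal sketch:

    #include <linux/jbd2.h>

    static void poke_bh_journal_state(struct buffer_head *bh)
    {
            jbd_lock_bh_state(bh);
            /* ... inspect or update journalling state of the buffer ... */
            jbd_unlock_bh_state(bh);
    }
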
7099 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/kdb.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/kdb.h
7100 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/kdb.h     2017-04-16 10:38:24.000000000 +0200
7101 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/kdb.h  2017-04-18 17:54:26.000000000 +0200
7102 @@ -167,6 +167,7 @@
7103  extern __printf(1, 2) int kdb_printf(const char *, ...);
7104  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7106 +#define in_kdb_printk()        (kdb_trap_printk)
7107  extern void kdb_init(int level);
7109  /* Access to kdb specific polling devices */
7110 @@ -201,6 +202,7 @@
7111  extern int kdb_unregister(char *);
7112  #else /* ! CONFIG_KGDB_KDB */
7113  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7114 +#define in_kdb_printk() (0)
7115  static inline void kdb_init(int level) {}
7116  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7117                                char *help, short minlen) { return 0; }
7118 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/kernel.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/kernel.h
7119 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/kernel.h  2017-04-16 10:38:24.000000000 +0200
7120 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/kernel.h       2017-04-18 17:54:26.000000000 +0200
7121 @@ -194,6 +194,9 @@
7122   */
7123  # define might_sleep() \
7124         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7126 +# define might_sleep_no_state_check() \
7127 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7128  # define sched_annotate_sleep()        (current->task_state_change = 0)
7129  #else
7130    static inline void ___might_sleep(const char *file, int line,
7131 @@ -201,6 +204,7 @@
7132    static inline void __might_sleep(const char *file, int line,
7133                                    int preempt_offset) { }
7134  # define might_sleep() do { might_resched(); } while (0)
7135 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7136  # define sched_annotate_sleep() do { } while (0)
7137  #endif
7139 @@ -488,6 +492,7 @@
7140         SYSTEM_HALT,
7141         SYSTEM_POWER_OFF,
7142         SYSTEM_RESTART,
7143 +       SYSTEM_SUSPEND,
7144  } system_state;
7146  #define TAINT_PROPRIETARY_MODULE       0
7147 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/list_bl.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/list_bl.h
7148 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/list_bl.h 2017-04-16 10:38:25.000000000 +0200
7149 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/list_bl.h      2017-04-18 17:54:26.000000000 +0200
7150 @@ -2,6 +2,7 @@
7151  #define _LINUX_LIST_BL_H
7153  #include <linux/list.h>
7154 +#include <linux/spinlock.h>
7155  #include <linux/bit_spinlock.h>
7157  /*
7158 @@ -32,13 +33,24 @@
7160  struct hlist_bl_head {
7161         struct hlist_bl_node *first;
7162 +#ifdef CONFIG_PREEMPT_RT_BASE
7163 +       raw_spinlock_t lock;
7164 +#endif
7165  };
7167  struct hlist_bl_node {
7168         struct hlist_bl_node *next, **pprev;
7169  };
7170 -#define INIT_HLIST_BL_HEAD(ptr) \
7171 -       ((ptr)->first = NULL)
7173 +#ifdef CONFIG_PREEMPT_RT_BASE
7174 +#define INIT_HLIST_BL_HEAD(h)          \
7175 +do {                                   \
7176 +       (h)->first = NULL;              \
7177 +       raw_spin_lock_init(&(h)->lock); \
7178 +} while (0)
7179 +#else
7180 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7181 +#endif
7183  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7185 @@ -118,12 +130,26 @@
7187  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7189 +#ifndef CONFIG_PREEMPT_RT_BASE
7190         bit_spin_lock(0, (unsigned long *)b);
7191 +#else
7192 +       raw_spin_lock(&b->lock);
7193 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7194 +       __set_bit(0, (unsigned long *)b);
7195 +#endif
7196 +#endif
7199  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7201 +#ifndef CONFIG_PREEMPT_RT_BASE
7202         __bit_spin_unlock(0, (unsigned long *)b);
7203 +#else
7204 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7205 +       __clear_bit(0, (unsigned long *)b);
7206 +#endif
7207 +       raw_spin_unlock(&b->lock);
7208 +#endif
7211  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
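
Illustration (not part of the patch): INIT_HLIST_BL_HEAD() now also initializes the raw spinlock that RT uses in place of bit 0 of the head pointer, while hlist_bl_lock()/hlist_bl_unlock() keep their signatures. A minimal sketch with a placeholder bucket:

    #include <linux/list_bl.h>

    static struct hlist_bl_head my_bucket;  /* INIT_HLIST_BL_HEAD(&my_bucket) at init time */

    static void add_entry(struct hlist_bl_node *n)
    {
            hlist_bl_lock(&my_bucket);
            hlist_bl_add_head(n, &my_bucket);
            hlist_bl_unlock(&my_bucket);
    }
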
7212 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/locallock.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/locallock.h
7213 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/locallock.h       1970-01-01 01:00:00.000000000 +0100
7214 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/locallock.h    2017-04-18 17:54:26.000000000 +0200
7215 @@ -0,0 +1,278 @@
7216 +#ifndef _LINUX_LOCALLOCK_H
7217 +#define _LINUX_LOCALLOCK_H
7219 +#include <linux/percpu.h>
7220 +#include <linux/spinlock.h>
7222 +#ifdef CONFIG_PREEMPT_RT_BASE
7224 +#ifdef CONFIG_DEBUG_SPINLOCK
7225 +# define LL_WARN(cond) WARN_ON(cond)
7226 +#else
7227 +# define LL_WARN(cond) do { } while (0)
7228 +#endif
7231 + * per cpu lock based substitute for local_irq_*()
7232 + */
7233 +struct local_irq_lock {
7234 +       spinlock_t              lock;
7235 +       struct task_struct      *owner;
7236 +       int                     nestcnt;
7237 +       unsigned long           flags;
7240 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7241 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7242 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7244 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7245 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7247 +#define local_irq_lock_init(lvar)                                      \
7248 +       do {                                                            \
7249 +               int __cpu;                                              \
7250 +               for_each_possible_cpu(__cpu)                            \
7251 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7252 +       } while (0)
7255 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
7256 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
7257 + * already takes care of the migrate_disable/enable
7258 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
7259 + */
7260 +#ifdef CONFIG_PREEMPT_RT_FULL
7261 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7262 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7263 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7264 +#else
7265 +# define spin_lock_local(lock)                 spin_lock(lock)
7266 +# define spin_trylock_local(lock)              spin_trylock(lock)
7267 +# define spin_unlock_local(lock)               spin_unlock(lock)
7268 +#endif
7270 +static inline void __local_lock(struct local_irq_lock *lv)
7272 +       if (lv->owner != current) {
7273 +               spin_lock_local(&lv->lock);
7274 +               LL_WARN(lv->owner);
7275 +               LL_WARN(lv->nestcnt);
7276 +               lv->owner = current;
7277 +       }
7278 +       lv->nestcnt++;
7281 +#define local_lock(lvar)                                       \
7282 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7284 +#define local_lock_on(lvar, cpu)                               \
7285 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7287 +static inline int __local_trylock(struct local_irq_lock *lv)
7289 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7290 +               LL_WARN(lv->owner);
7291 +               LL_WARN(lv->nestcnt);
7292 +               lv->owner = current;
7293 +               lv->nestcnt = 1;
7294 +               return 1;
7295 +       }
7296 +       return 0;
7299 +#define local_trylock(lvar)                                            \
7300 +       ({                                                              \
7301 +               int __locked;                                           \
7302 +               __locked = __local_trylock(&get_local_var(lvar));       \
7303 +               if (!__locked)                                          \
7304 +                       put_local_var(lvar);                            \
7305 +               __locked;                                               \
7306 +       })
7308 +static inline void __local_unlock(struct local_irq_lock *lv)
7310 +       LL_WARN(lv->nestcnt == 0);
7311 +       LL_WARN(lv->owner != current);
7312 +       if (--lv->nestcnt)
7313 +               return;
7315 +       lv->owner = NULL;
7316 +       spin_unlock_local(&lv->lock);
7319 +#define local_unlock(lvar)                                     \
7320 +       do {                                                    \
7321 +               __local_unlock(this_cpu_ptr(&lvar));            \
7322 +               put_local_var(lvar);                            \
7323 +       } while (0)
7325 +#define local_unlock_on(lvar, cpu)                       \
7326 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7328 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7330 +       spin_lock_irqsave(&lv->lock, lv->flags);
7331 +       LL_WARN(lv->owner);
7332 +       LL_WARN(lv->nestcnt);
7333 +       lv->owner = current;
7334 +       lv->nestcnt = 1;
7337 +#define local_lock_irq(lvar)                                           \
7338 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7340 +#define local_lock_irq_on(lvar, cpu)                                   \
7341 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7343 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7345 +       LL_WARN(!lv->nestcnt);
7346 +       LL_WARN(lv->owner != current);
7347 +       lv->owner = NULL;
7348 +       lv->nestcnt = 0;
7349 +       spin_unlock_irq(&lv->lock);
7352 +#define local_unlock_irq(lvar)                                         \
7353 +       do {                                                            \
7354 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7355 +               put_local_var(lvar);                                    \
7356 +       } while (0)
7358 +#define local_unlock_irq_on(lvar, cpu)                                 \
7359 +       do {                                                            \
7360 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7361 +       } while (0)
7363 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7365 +       if (lv->owner != current) {
7366 +               __local_lock_irq(lv);
7367 +               return 0;
7368 +       } else {
7369 +               lv->nestcnt++;
7370 +               return 1;
7371 +       }
7374 +#define local_lock_irqsave(lvar, _flags)                               \
7375 +       do {                                                            \
7376 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7377 +                       put_local_var(lvar);                            \
7378 +               _flags = __this_cpu_read(lvar.flags);                   \
7379 +       } while (0)
7381 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7382 +       do {                                                            \
7383 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7384 +               _flags = per_cpu(lvar, cpu).flags;                      \
7385 +       } while (0)
7387 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7388 +                                           unsigned long flags)
7390 +       LL_WARN(!lv->nestcnt);
7391 +       LL_WARN(lv->owner != current);
7392 +       if (--lv->nestcnt)
7393 +               return 0;
7395 +       lv->owner = NULL;
7396 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7397 +       return 1;
7400 +#define local_unlock_irqrestore(lvar, flags)                           \
7401 +       do {                                                            \
7402 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7403 +                       put_local_var(lvar);                            \
7404 +       } while (0)
7406 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7407 +       do {                                                            \
7408 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7409 +       } while (0)
7411 +#define local_spin_trylock_irq(lvar, lock)                             \
7412 +       ({                                                              \
7413 +               int __locked;                                           \
7414 +               local_lock_irq(lvar);                                   \
7415 +               __locked = spin_trylock(lock);                          \
7416 +               if (!__locked)                                          \
7417 +                       local_unlock_irq(lvar);                         \
7418 +               __locked;                                               \
7419 +       })
7421 +#define local_spin_lock_irq(lvar, lock)                                        \
7422 +       do {                                                            \
7423 +               local_lock_irq(lvar);                                   \
7424 +               spin_lock(lock);                                        \
7425 +       } while (0)
7427 +#define local_spin_unlock_irq(lvar, lock)                              \
7428 +       do {                                                            \
7429 +               spin_unlock(lock);                                      \
7430 +               local_unlock_irq(lvar);                                 \
7431 +       } while (0)
7433 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7434 +       do {                                                            \
7435 +               local_lock_irqsave(lvar, flags);                        \
7436 +               spin_lock(lock);                                        \
7437 +       } while (0)
7439 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7440 +       do {                                                            \
7441 +               spin_unlock(lock);                                      \
7442 +               local_unlock_irqrestore(lvar, flags);                   \
7443 +       } while (0)
7445 +#define get_locked_var(lvar, var)                                      \
7446 +       (*({                                                            \
7447 +               local_lock(lvar);                                       \
7448 +               this_cpu_ptr(&var);                                     \
7449 +       }))
7451 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7453 +#define local_lock_cpu(lvar)                                           \
7454 +       ({                                                              \
7455 +               local_lock(lvar);                                       \
7456 +               smp_processor_id();                                     \
7457 +       })
7459 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7461 +#else /* PREEMPT_RT_BASE */
7463 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7464 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7466 +static inline void local_irq_lock_init(int lvar) { }
7468 +#define local_lock(lvar)                       preempt_disable()
7469 +#define local_unlock(lvar)                     preempt_enable()
7470 +#define local_lock_irq(lvar)                   local_irq_disable()
7471 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7472 +#define local_unlock_irq(lvar)                 local_irq_enable()
7473 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7474 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7475 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7477 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7478 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7479 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7480 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7481 +       spin_lock_irqsave(lock, flags)
7482 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7483 +       spin_unlock_irqrestore(lock, flags)
7485 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7486 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7488 +#define local_lock_cpu(lvar)                   get_cpu()
7489 +#define local_unlock_cpu(lvar)                 put_cpu()
7491 +#endif
7493 +#endif
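
Illustration (not part of the patch): a local lock replaces open-coded preempt/irq disabling around per-CPU data; on !RT the macros map back to preempt_disable()/local_irq_save(), on RT they take the per-CPU spinlock defined here so the section stays preemptible. A minimal sketch with a placeholder per-CPU counter:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, my_counter);
    static DEFINE_LOCAL_IRQ_LOCK(my_counter_lock);

    static void bump_my_counter(void)
    {
            unsigned long flags;

            local_lock_irqsave(my_counter_lock, flags);
            __this_cpu_inc(my_counter);
            local_unlock_irqrestore(my_counter_lock, flags);
    }
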
7494 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mm_types.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mm_types.h
7495 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mm_types.h        2017-04-16 10:38:25.000000000 +0200
7496 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mm_types.h     2017-04-18 17:54:26.000000000 +0200
7497 @@ -11,6 +11,7 @@
7498  #include <linux/completion.h>
7499  #include <linux/cpumask.h>
7500  #include <linux/uprobes.h>
7501 +#include <linux/rcupdate.h>
7502  #include <linux/page-flags-layout.h>
7503  #include <linux/workqueue.h>
7504  #include <asm/page.h>
7505 @@ -509,6 +510,9 @@
7506         bool tlb_flush_pending;
7507  #endif
7508         struct uprobes_state uprobes_state;
7509 +#ifdef CONFIG_PREEMPT_RT_BASE
7510 +       struct rcu_head delayed_drop;
7511 +#endif
7512  #ifdef CONFIG_X86_INTEL_MPX
7513         /* address of the bounds directory */
7514         void __user *bd_addr;
7515 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/module.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/module.h
7516 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/module.h  2017-04-16 10:38:25.000000000 +0200
7517 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/module.h       2017-04-18 17:54:26.000000000 +0200
7518 @@ -496,6 +496,7 @@
7519  struct module *__module_text_address(unsigned long addr);
7520  struct module *__module_address(unsigned long addr);
7521  bool is_module_address(unsigned long addr);
7522 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
7523  bool is_module_percpu_address(unsigned long addr);
7524  bool is_module_text_address(unsigned long addr);
7526 @@ -663,6 +664,11 @@
7527         return false;
7530 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
7532 +       return false;
7535  static inline bool is_module_text_address(unsigned long addr)
7537         return false;
7538 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mutex.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mutex.h
7539 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mutex.h   2017-04-16 10:38:25.000000000 +0200
7540 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mutex.h        2017-04-18 17:54:26.000000000 +0200
7541 @@ -19,6 +19,17 @@
7542  #include <asm/processor.h>
7543  #include <linux/osq_lock.h>
7545 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7546 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7547 +       , .dep_map = { .name = #lockname }
7548 +#else
7549 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7550 +#endif
7552 +#ifdef CONFIG_PREEMPT_RT_FULL
7553 +# include <linux/mutex_rt.h>
7554 +#else
7556  /*
7557   * Simple, straightforward mutexes with strict semantics:
7558   *
7559 @@ -99,13 +110,6 @@
7560  static inline void mutex_destroy(struct mutex *lock) {}
7561  #endif
7563 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7564 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7565 -               , .dep_map = { .name = #lockname }
7566 -#else
7567 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7568 -#endif
7570  #define __MUTEX_INITIALIZER(lockname) \
7571                 { .count = ATOMIC_INIT(1) \
7572                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7573 @@ -173,6 +177,8 @@
7574  extern int mutex_trylock(struct mutex *lock);
7575  extern void mutex_unlock(struct mutex *lock);
7577 +#endif /* !PREEMPT_RT_FULL */
7579  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7581  #endif /* __LINUX_MUTEX_H */
7582 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mutex_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mutex_rt.h
7583 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/mutex_rt.h        1970-01-01 01:00:00.000000000 +0100
7584 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/mutex_rt.h     2017-04-18 17:54:26.000000000 +0200
7585 @@ -0,0 +1,89 @@
7586 +#ifndef __LINUX_MUTEX_RT_H
7587 +#define __LINUX_MUTEX_RT_H
7589 +#ifndef __LINUX_MUTEX_H
7590 +#error "Please include mutex.h"
7591 +#endif
7593 +#include <linux/rtmutex.h>
7595 +/* FIXME: Just for __lockfunc */
7596 +#include <linux/spinlock.h>
7598 +struct mutex {
7599 +       struct rt_mutex         lock;
7600 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7601 +       struct lockdep_map      dep_map;
7602 +#endif
7605 +#define __MUTEX_INITIALIZER(mutexname)                                 \
7606 +       {                                                               \
7607 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
7608 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
7609 +       }
7611 +#define DEFINE_MUTEX(mutexname)                                                \
7612 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7614 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7615 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7616 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7617 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7618 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7619 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7620 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7621 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7622 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7623 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7625 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
7626 +#define mutex_lock(l)                  _mutex_lock(l)
7627 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
7628 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
7629 +#define mutex_trylock(l)               _mutex_trylock(l)
7630 +#define mutex_unlock(l)                        _mutex_unlock(l)
7632 +#ifdef CONFIG_DEBUG_MUTEXES
7633 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
7634 +#else
7635 +static inline void mutex_destroy(struct mutex *lock) {}
7636 +#endif
7638 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7639 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
7640 +# define mutex_lock_interruptible_nested(l, s) \
7641 +                                       _mutex_lock_interruptible_nested(l, s)
7642 +# define mutex_lock_killable_nested(l, s) \
7643 +                                       _mutex_lock_killable_nested(l, s)
7645 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
7646 +do {                                                                   \
7647 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
7648 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
7649 +} while (0)
7651 +#else
7652 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
7653 +# define mutex_lock_interruptible_nested(l, s) \
7654 +                                       _mutex_lock_interruptible(l)
7655 +# define mutex_lock_killable_nested(l, s) \
7656 +                                       _mutex_lock_killable(l)
7657 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7658 +#endif
7660 +# define mutex_init(mutex)                             \
7661 +do {                                                   \
7662 +       static struct lock_class_key __key;             \
7663 +                                                       \
7664 +       rt_mutex_init(&(mutex)->lock);                  \
7665 +       __mutex_do_init((mutex), #mutex, &__key);       \
7666 +} while (0)
7668 +# define __mutex_init(mutex, name, key)                        \
7669 +do {                                                   \
7670 +       rt_mutex_init(&(mutex)->lock);                  \
7671 +       __mutex_do_init((mutex), name, key);            \
7672 +} while (0)
7674 +#endif
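
Illustration (not part of the patch): with PREEMPT_RT_FULL a struct mutex wraps an rt_mutex, so existing users gain priority inheritance without source changes; mutex_lock()/mutex_unlock() simply map to the _mutex_*() wrappers above. A minimal sketch:

    #include <linux/mutex.h>

    static DEFINE_MUTEX(my_lock);

    static void update_shared_state(void)
    {
            mutex_lock(&my_lock);
            /* ... may sleep; lock owner is priority-boosted by waiters on RT ... */
            mutex_unlock(&my_lock);
    }
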
7675 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/netdevice.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/netdevice.h
7676 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/netdevice.h       2017-04-16 10:38:25.000000000 +0200
7677 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/netdevice.h    2017-04-18 17:54:26.000000000 +0200
7678 @@ -396,7 +396,19 @@
7679  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7681  void __napi_schedule(struct napi_struct *n);
7684 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7685 + * run as threads, and they can also be preempted (without PREEMPT_RT
7686 + * interrupt threads can not be preempted). Which means that calling
7687 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
7688 + * and can corrupt the napi->poll_list.
7689 + */
7690 +#ifdef CONFIG_PREEMPT_RT_FULL
7691 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7692 +#else
7693  void __napi_schedule_irqoff(struct napi_struct *n);
7694 +#endif
7696  static inline bool napi_disable_pending(struct napi_struct *n)
7698 @@ -2463,14 +2475,53 @@
7699  void synchronize_net(void);
7700  int init_dummy_netdev(struct net_device *dev);
7702 -DECLARE_PER_CPU(int, xmit_recursion);
7703  #define XMIT_RECURSION_LIMIT   10
7704 +#ifdef CONFIG_PREEMPT_RT_FULL
7705 +static inline int dev_recursion_level(void)
7707 +       return current->xmit_recursion;
7710 +static inline int xmit_rec_read(void)
7712 +       return current->xmit_recursion;
7715 +static inline void xmit_rec_inc(void)
7717 +       current->xmit_recursion++;
7720 +static inline void xmit_rec_dec(void)
7722 +       current->xmit_recursion--;
7725 +#else
7727 +DECLARE_PER_CPU(int, xmit_recursion);
7729  static inline int dev_recursion_level(void)
7731         return this_cpu_read(xmit_recursion);
7734 +static inline int xmit_rec_read(void)
7736 +       return __this_cpu_read(xmit_recursion);
7739 +static inline void xmit_rec_inc(void)
7741 +       __this_cpu_inc(xmit_recursion);
7744 +static inline void xmit_rec_dec(void)
7746 +       __this_cpu_dec(xmit_recursion);
7748 +#endif
7750  struct net_device *dev_get_by_index(struct net *net, int ifindex);
7751  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7752  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7753 @@ -2855,6 +2906,7 @@
7754         unsigned int            dropped;
7755         struct sk_buff_head     input_pkt_queue;
7756         struct napi_struct      backlog;
7757 +       struct sk_buff_head     tofree_queue;
7759  };
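
Illustration (not part of the patch): the xmit_rec_*() helpers hide where the transmit-recursion counter lives (current->xmit_recursion on RT, added to task_struct elsewhere in this patch; a per-CPU variable otherwise). A sketch of the guard the core transmit path is expected to build from them; guarded_xmit() is a placeholder name:

    #include <linux/netdevice.h>

    static netdev_tx_t guarded_xmit(struct sk_buff *skb, struct net_device *dev,
                                    netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *))
    {
            netdev_tx_t ret;

            if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
                    return NETDEV_TX_BUSY;

            xmit_rec_inc();
            ret = xmit(skb, dev);
            xmit_rec_dec();

            return ret;
    }
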
7761 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/netfilter/x_tables.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/netfilter/x_tables.h
7762 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/netfilter/x_tables.h      2017-04-16 10:38:25.000000000 +0200
7763 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/netfilter/x_tables.h   2017-04-18 17:54:26.000000000 +0200
7764 @@ -4,6 +4,7 @@
7766  #include <linux/netdevice.h>
7767  #include <linux/static_key.h>
7768 +#include <linux/locallock.h>
7769  #include <uapi/linux/netfilter/x_tables.h>
7771  /* Test a struct->invflags and a boolean for inequality */
7772 @@ -300,6 +301,8 @@
7773   */
7774  DECLARE_PER_CPU(seqcount_t, xt_recseq);
7776 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7778  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7779   *
7780   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7781 @@ -320,6 +323,9 @@
7783         unsigned int addend;
7785 +       /* RT protection */
7786 +       local_lock(xt_write_lock);
7788         /*
7789          * Low order bit of sequence is set if we already
7790          * called xt_write_recseq_begin().
7791 @@ -350,6 +356,7 @@
7792         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7793         smp_wmb();
7794         __this_cpu_add(xt_recseq.sequence, addend);
7795 +       local_unlock(xt_write_lock);
7798  /*
7799 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/nfs_fs.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/nfs_fs.h
7800 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/nfs_fs.h  2017-04-16 10:38:25.000000000 +0200
7801 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/nfs_fs.h       2017-04-18 17:54:26.000000000 +0200
7802 @@ -165,7 +165,11 @@
7804         /* Readers: in-flight sillydelete RPC calls */
7805         /* Writers: rmdir */
7806 +#ifdef CONFIG_PREEMPT_RT_BASE
7807 +       struct semaphore        rmdir_sem;
7808 +#else
7809         struct rw_semaphore     rmdir_sem;
7810 +#endif
7812  #if IS_ENABLED(CONFIG_NFS_V4)
7813         struct nfs4_cached_acl  *nfs4_acl;
7814 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/nfs_xdr.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/nfs_xdr.h
7815 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/nfs_xdr.h 2017-04-16 10:38:25.000000000 +0200
7816 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/nfs_xdr.h      2017-04-18 17:54:26.000000000 +0200
7817 @@ -1490,7 +1490,7 @@
7818         struct nfs_removeargs args;
7819         struct nfs_removeres res;
7820         struct dentry *dentry;
7821 -       wait_queue_head_t wq;
7822 +       struct swait_queue_head wq;
7823         struct rpc_cred *cred;
7824         struct nfs_fattr dir_attr;
7825         long timeout;
7826 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/notifier.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/notifier.h
7827 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/notifier.h        2017-04-16 10:38:25.000000000 +0200
7828 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/notifier.h     2017-04-18 17:54:26.000000000 +0200
7829 @@ -6,7 +6,7 @@
7830   *
7831   *                             Alan Cox <Alan.Cox@linux.org>
7832   */
7835  #ifndef _LINUX_NOTIFIER_H
7836  #define _LINUX_NOTIFIER_H
7837  #include <linux/errno.h>
7838 @@ -42,9 +42,7 @@
7839   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
7840   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
7841   * SRCU notifier chains should be used when the chain will be called very
7842 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
7843 - * chains are slightly more difficult to use because they require special
7844 - * runtime initialization.
7845 + * often but notifier_blocks will seldom be removed.
7846   */
7848  struct notifier_block;
7849 @@ -90,7 +88,7 @@
7850                 (name)->head = NULL;            \
7851         } while (0)
7853 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
7854 +/* srcu_notifier_heads must be cleaned up dynamically */
7855  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7856  #define srcu_cleanup_notifier_head(name)       \
7857                 cleanup_srcu_struct(&(name)->srcu);
7858 @@ -103,7 +101,13 @@
7859                 .head = NULL }
7860  #define RAW_NOTIFIER_INIT(name)        {                               \
7861                 .head = NULL }
7862 -/* srcu_notifier_heads cannot be initialized statically */
7864 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
7865 +       {                                                       \
7866 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
7867 +               .head = NULL,                                   \
7868 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
7869 +       }
7871  #define ATOMIC_NOTIFIER_HEAD(name)                             \
7872         struct atomic_notifier_head name =                      \
7873 @@ -115,6 +119,18 @@
7874         struct raw_notifier_head name =                         \
7875                 RAW_NOTIFIER_INIT(name)
7877 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
7878 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
7879 +                       name##_head_srcu_array);                \
7880 +       mod struct srcu_notifier_head name =                    \
7881 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
7883 +#define SRCU_NOTIFIER_HEAD(name)                               \
7884 +       _SRCU_NOTIFIER_HEAD(name, )
7886 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
7887 +       _SRCU_NOTIFIER_HEAD(name, static)
7889  #ifdef __KERNEL__
7891  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
7892 @@ -184,12 +200,12 @@
7894  /*
7895   *     Declared notifiers so far. I can imagine quite a few more chains
7896 - *     over time (eg laptop power reset chains, reboot chain (to clean 
7897 + *     over time (eg laptop power reset chains, reboot chain (to clean
7898   *     device units up), device [un]mount chain, module load/unload chain,
7899 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
7900 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
7901   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
7902   */
7905  /* CPU notfiers are defined in include/linux/cpu.h. */
7907  /* netdevice notifiers are defined in include/linux/netdevice.h */
7908 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/percpu-rwsem.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/percpu-rwsem.h
7909 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/percpu-rwsem.h    2017-04-16 10:38:25.000000000 +0200
7910 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/percpu-rwsem.h 2017-04-18 17:54:26.000000000 +0200
7911 @@ -4,7 +4,7 @@
7912  #include <linux/atomic.h>
7913  #include <linux/rwsem.h>
7914  #include <linux/percpu.h>
7915 -#include <linux/wait.h>
7916 +#include <linux/swait.h>
7917  #include <linux/rcu_sync.h>
7918  #include <linux/lockdep.h>
7920 @@ -12,7 +12,7 @@
7921         struct rcu_sync         rss;
7922         unsigned int __percpu   *read_count;
7923         struct rw_semaphore     rw_sem;
7924 -       wait_queue_head_t       writer;
7925 +       struct swait_queue_head writer;
7926         int                     readers_block;
7927  };
7929 @@ -22,13 +22,13 @@
7930         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
7931         .read_count = &__percpu_rwsem_rc_##name,                        \
7932         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
7933 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
7934 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
7937  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
7938  extern void __percpu_up_read(struct percpu_rw_semaphore *);
7940 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
7941 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7943         might_sleep();
7945 @@ -46,16 +46,10 @@
7946         __this_cpu_inc(*sem->read_count);
7947         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
7948                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
7949 -       barrier();
7950         /*
7951 -        * The barrier() prevents the compiler from
7952 +        * The preempt_enable() prevents the compiler from
7953          * bleeding the critical section out.
7954          */
7957 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7959 -       percpu_down_read_preempt_disable(sem);
7960         preempt_enable();
7963 @@ -82,13 +76,9 @@
7964         return ret;
7967 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
7968 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7970 -       /*
7971 -        * The barrier() prevents the compiler from
7972 -        * bleeding the critical section out.
7973 -        */
7974 -       barrier();
7975 +       preempt_disable();
7976         /*
7977          * Same as in percpu_down_read().
7978          */
7979 @@ -101,12 +91,6 @@
7980         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
7983 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7985 -       preempt_disable();
7986 -       percpu_up_read_preempt_enable(sem);
7989  extern void percpu_down_write(struct percpu_rw_semaphore *);
7990  extern void percpu_up_write(struct percpu_rw_semaphore *);
7992 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/percpu.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/percpu.h
7993 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/percpu.h  2017-04-16 10:38:25.000000000 +0200
7994 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/percpu.h       2017-04-18 17:54:26.000000000 +0200
7995 @@ -18,6 +18,35 @@
7996  #define PERCPU_MODULE_RESERVE          0
7997  #endif
7999 +#ifdef CONFIG_PREEMPT_RT_FULL
8001 +#define get_local_var(var) (*({        \
8002 +       migrate_disable();      \
8003 +       this_cpu_ptr(&var);     }))
8005 +#define put_local_var(var) do {        \
8006 +       (void)&(var);           \
8007 +       migrate_enable();       \
8008 +} while (0)
8010 +# define get_local_ptr(var) ({ \
8011 +       migrate_disable();      \
8012 +       this_cpu_ptr(var);      })
8014 +# define put_local_ptr(var) do {       \
8015 +       (void)(var);                    \
8016 +       migrate_enable();               \
8017 +} while (0)
8019 +#else
8021 +#define get_local_var(var)     get_cpu_var(var)
8022 +#define put_local_var(var)     put_cpu_var(var)
8023 +#define get_local_ptr(var)     get_cpu_ptr(var)
8024 +#define put_local_ptr(var)     put_cpu_ptr(var)
8026 +#endif
8028  /* minimum unit size, also is the maximum supported allocation size */
8029  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8031 @@ -110,6 +139,7 @@
8032  #endif
8034  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
8035 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
8036  extern bool is_kernel_percpu_address(unsigned long addr);
8038  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
8039 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/pid.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/pid.h
8040 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/pid.h     2017-04-16 10:38:25.000000000 +0200
8041 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/pid.h  2017-04-18 17:54:26.000000000 +0200
8042 @@ -2,6 +2,7 @@
8043  #define _LINUX_PID_H
8045  #include <linux/rcupdate.h>
8046 +#include <linux/atomic.h>
8048  enum pid_type
8050 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/preempt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/preempt.h
8051 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/preempt.h 2017-04-16 10:38:26.000000000 +0200
8052 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/preempt.h      2017-04-18 17:54:26.000000000 +0200
8053 @@ -50,7 +50,11 @@
8054  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8055  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8057 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8058 +#ifndef CONFIG_PREEMPT_RT_FULL
8059 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8060 +#else
8061 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8062 +#endif
8064  /* We use the MSB mostly because its available */
8065  #define PREEMPT_NEED_RESCHED   0x80000000
8066 @@ -59,9 +63,15 @@
8067  #include <asm/preempt.h>
8069  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8070 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8071  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8072                                  | NMI_MASK))
8073 +#ifndef CONFIG_PREEMPT_RT_FULL
8074 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8075 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8076 +#else
8077 +# define softirq_count()       (0UL)
8078 +extern int in_serving_softirq(void);
8079 +#endif
8081  /*
8082   * Are we doing bottom half or hardware interrupt processing?
8083 @@ -72,7 +82,6 @@
8084  #define in_irq()               (hardirq_count())
8085  #define in_softirq()           (softirq_count())
8086  #define in_interrupt()         (irq_count())
8087 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8089  /*
8090   * Are we in NMI context?
8091 @@ -91,7 +100,11 @@
8092  /*
8093   * The preempt_count offset after spin_lock()
8094   */
8095 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8096  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8097 +#else
8098 +#define PREEMPT_LOCK_OFFSET    0
8099 +#endif
8101  /*
8102   * The preempt_count offset needed for things like:
8103 @@ -140,6 +153,20 @@
8104  #define preempt_count_inc() preempt_count_add(1)
8105  #define preempt_count_dec() preempt_count_sub(1)
8107 +#ifdef CONFIG_PREEMPT_LAZY
8108 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8109 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8110 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8111 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8112 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8113 +#else
8114 +#define add_preempt_lazy_count(val)    do { } while (0)
8115 +#define sub_preempt_lazy_count(val)    do { } while (0)
8116 +#define inc_preempt_lazy_count()       do { } while (0)
8117 +#define dec_preempt_lazy_count()       do { } while (0)
8118 +#define preempt_lazy_count()           (0)
8119 +#endif
8121  #ifdef CONFIG_PREEMPT_COUNT
8123  #define preempt_disable() \
8124 @@ -148,13 +175,25 @@
8125         barrier(); \
8126  } while (0)
8128 +#define preempt_lazy_disable() \
8129 +do { \
8130 +       inc_preempt_lazy_count(); \
8131 +       barrier(); \
8132 +} while (0)
8134  #define sched_preempt_enable_no_resched() \
8135  do { \
8136         barrier(); \
8137         preempt_count_dec(); \
8138  } while (0)
8140 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8141 +#ifdef CONFIG_PREEMPT_RT_BASE
8142 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8143 +# define preempt_check_resched_rt() preempt_check_resched()
8144 +#else
8145 +# define preempt_enable_no_resched() preempt_enable()
8146 +# define preempt_check_resched_rt() barrier();
8147 +#endif
8149  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8151 @@ -179,6 +218,13 @@
8152                 __preempt_schedule(); \
8153  } while (0)
8155 +#define preempt_lazy_enable() \
8156 +do { \
8157 +       dec_preempt_lazy_count(); \
8158 +       barrier(); \
8159 +       preempt_check_resched(); \
8160 +} while (0)
8162  #else /* !CONFIG_PREEMPT */
8163  #define preempt_enable() \
8164  do { \
8165 @@ -224,6 +270,7 @@
8166  #define preempt_disable_notrace()              barrier()
8167  #define preempt_enable_no_resched_notrace()    barrier()
8168  #define preempt_enable_notrace()               barrier()
8169 +#define preempt_check_resched_rt()             barrier()
8170  #define preemptible()                          0
8172  #endif /* CONFIG_PREEMPT_COUNT */
8173 @@ -244,10 +291,31 @@
8174  } while (0)
8175  #define preempt_fold_need_resched() \
8176  do { \
8177 -       if (tif_need_resched()) \
8178 +       if (tif_need_resched_now()) \
8179                 set_preempt_need_resched(); \
8180  } while (0)
8182 +#ifdef CONFIG_PREEMPT_RT_FULL
8183 +# define preempt_disable_rt()          preempt_disable()
8184 +# define preempt_enable_rt()           preempt_enable()
8185 +# define preempt_disable_nort()                barrier()
8186 +# define preempt_enable_nort()         barrier()
8187 +# ifdef CONFIG_SMP
8188 +   extern void migrate_disable(void);
8189 +   extern void migrate_enable(void);
8190 +# else /* CONFIG_SMP */
8191 +#  define migrate_disable()            barrier()
8192 +#  define migrate_enable()             barrier()
8193 +# endif /* CONFIG_SMP */
8194 +#else
8195 +# define preempt_disable_rt()          barrier()
8196 +# define preempt_enable_rt()           barrier()
8197 +# define preempt_disable_nort()                preempt_disable()
8198 +# define preempt_enable_nort()         preempt_enable()
8199 +# define migrate_disable()             preempt_disable()
8200 +# define migrate_enable()              preempt_enable()
8201 +#endif
8203  #ifdef CONFIG_PREEMPT_NOTIFIERS
8205  struct preempt_notifier;
8206 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/printk.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/printk.h
8207 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/printk.h  2017-04-16 10:38:26.000000000 +0200
8208 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/printk.h       2017-04-18 17:54:26.000000000 +0200
8209 @@ -126,9 +126,11 @@
8210  #ifdef CONFIG_EARLY_PRINTK
8211  extern asmlinkage __printf(1, 2)
8212  void early_printk(const char *fmt, ...);
8213 +extern void printk_kill(void);
8214  #else
8215  static inline __printf(1, 2) __cold
8216  void early_printk(const char *s, ...) { }
8217 +static inline void printk_kill(void) { }
8218  #endif
8220  #ifdef CONFIG_PRINTK_NMI
8221 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/radix-tree.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/radix-tree.h
8222 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/radix-tree.h      2017-04-16 10:38:26.000000000 +0200
8223 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/radix-tree.h   2017-04-18 17:54:26.000000000 +0200
8224 @@ -292,6 +292,8 @@
8225  int radix_tree_preload(gfp_t gfp_mask);
8226  int radix_tree_maybe_preload(gfp_t gfp_mask);
8227  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8228 +void radix_tree_preload_end(void);
8230  void radix_tree_init(void);
8231  void *radix_tree_tag_set(struct radix_tree_root *root,
8232                         unsigned long index, unsigned int tag);
8233 @@ -314,11 +316,6 @@
8234  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8235  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8237 -static inline void radix_tree_preload_end(void)
8239 -       preempt_enable();
8242  /**
8243   * struct radix_tree_iter - radix tree iterator state
8244   *
8245 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/random.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/random.h
8246 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/random.h  2017-04-16 10:38:26.000000000 +0200
8247 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/random.h       2017-04-18 17:54:26.000000000 +0200
8248 @@ -31,7 +31,7 @@
8250  extern void add_input_randomness(unsigned int type, unsigned int code,
8251                                  unsigned int value) __latent_entropy;
8252 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8253 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8255  extern void get_random_bytes(void *buf, int nbytes);
8256  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8257 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rbtree.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rbtree.h
8258 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rbtree.h  2017-04-16 10:38:26.000000000 +0200
8259 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rbtree.h       2017-04-18 17:54:26.000000000 +0200
8260 @@ -31,7 +31,7 @@
8262  #include <linux/kernel.h>
8263  #include <linux/stddef.h>
8264 -#include <linux/rcupdate.h>
8265 +#include <linux/rcu_assign_pointer.h>
8267  struct rb_node {
8268         unsigned long  __rb_parent_color;
8269 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rbtree_augmented.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rbtree_augmented.h
8270 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rbtree_augmented.h        2017-04-16 10:38:26.000000000 +0200
8271 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rbtree_augmented.h     2017-04-18 17:54:26.000000000 +0200
8272 @@ -26,6 +26,7 @@
8274  #include <linux/compiler.h>
8275  #include <linux/rbtree.h>
8276 +#include <linux/rcupdate.h>
8278  /*
8279   * Please note - only struct rb_augment_callbacks and the prototypes for
8280 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcu_assign_pointer.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcu_assign_pointer.h
8281 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcu_assign_pointer.h      1970-01-01 01:00:00.000000000 +0100
8282 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcu_assign_pointer.h   2017-04-18 17:54:26.000000000 +0200
8283 @@ -0,0 +1,54 @@
8284 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8285 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8286 +#include <linux/compiler.h>
8287 +#include <asm/barrier.h>
8289 +/**
8290 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8291 + * @v: The value to statically initialize with.
8292 + */
8293 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8295 +/**
8296 + * rcu_assign_pointer() - assign to RCU-protected pointer
8297 + * @p: pointer to assign to
8298 + * @v: value to assign (publish)
8299 + *
8300 + * Assigns the specified value to the specified RCU-protected
8301 + * pointer, ensuring that any concurrent RCU readers will see
8302 + * any prior initialization.
8303 + *
8304 + * Inserts memory barriers on architectures that require them
8305 + * (which is most of them), and also prevents the compiler from
8306 + * reordering the code that initializes the structure after the pointer
8307 + * assignment.  More importantly, this call documents which pointers
8308 + * will be dereferenced by RCU read-side code.
8309 + *
8310 + * In some special cases, you may use RCU_INIT_POINTER() instead
8311 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8312 + * to the fact that it does not constrain either the CPU or the compiler.
8313 + * That said, using RCU_INIT_POINTER() when you should have used
8314 + * rcu_assign_pointer() is a very bad thing that results in
8315 + * impossible-to-diagnose memory corruption.  So please be careful.
8316 + * See the RCU_INIT_POINTER() comment header for details.
8317 + *
8318 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8319 + * once, appearances notwithstanding.  One of the "extra" evaluations
8320 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8321 + * neither of which actually execute the argument.  As with most cpp
8322 + * macros, this execute-arguments-only-once property is important, so
8323 + * please be careful when making changes to rcu_assign_pointer() and the
8324 + * other macros that it invokes.
8325 + */
8326 +#define rcu_assign_pointer(p, v)                                             \
8327 +({                                                                           \
8328 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8329 +                                                                             \
8330 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8331 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8332 +       else                                                                  \
8333 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8334 +       _r_a_p__v;                                                            \
8337 +#endif
8338 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcupdate.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcupdate.h
8339 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcupdate.h        2017-04-16 10:38:26.000000000 +0200
8340 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcupdate.h     2017-04-18 17:54:26.000000000 +0200
8341 @@ -46,6 +46,7 @@
8342  #include <linux/compiler.h>
8343  #include <linux/ktime.h>
8344  #include <linux/irqflags.h>
8345 +#include <linux/rcu_assign_pointer.h>
8347  #include <asm/barrier.h>
8349 @@ -178,6 +179,9 @@
8351  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8353 +#ifdef CONFIG_PREEMPT_RT_FULL
8354 +#define call_rcu_bh    call_rcu
8355 +#else
8356  /**
8357   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8358   * @head: structure to be used for queueing the RCU updates.
8359 @@ -201,6 +205,7 @@
8360   */
8361  void call_rcu_bh(struct rcu_head *head,
8362                  rcu_callback_t func);
8363 +#endif
8365  /**
8366   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8367 @@ -301,6 +306,11 @@
8368   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8369   */
8370  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8371 +#ifndef CONFIG_PREEMPT_RT_FULL
8372 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8373 +#else
8374 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8375 +#endif
8377  #else /* #ifdef CONFIG_PREEMPT_RCU */
8379 @@ -326,6 +336,8 @@
8380         return 0;
8383 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8385  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8387  /* Internal to kernel */
8388 @@ -505,7 +517,14 @@
8389  int debug_lockdep_rcu_enabled(void);
8391  int rcu_read_lock_held(void);
8392 +#ifdef CONFIG_PREEMPT_RT_FULL
8393 +static inline int rcu_read_lock_bh_held(void)
8395 +       return rcu_read_lock_held();
8397 +#else
8398  int rcu_read_lock_bh_held(void);
8399 +#endif
8401  /**
8402   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8403 @@ -626,54 +645,6 @@
8404  })
8406  /**
8407 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8408 - * @v: The value to statically initialize with.
8409 - */
8410 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8412 -/**
8413 - * rcu_assign_pointer() - assign to RCU-protected pointer
8414 - * @p: pointer to assign to
8415 - * @v: value to assign (publish)
8416 - *
8417 - * Assigns the specified value to the specified RCU-protected
8418 - * pointer, ensuring that any concurrent RCU readers will see
8419 - * any prior initialization.
8420 - *
8421 - * Inserts memory barriers on architectures that require them
8422 - * (which is most of them), and also prevents the compiler from
8423 - * reordering the code that initializes the structure after the pointer
8424 - * assignment.  More importantly, this call documents which pointers
8425 - * will be dereferenced by RCU read-side code.
8426 - *
8427 - * In some special cases, you may use RCU_INIT_POINTER() instead
8428 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8429 - * to the fact that it does not constrain either the CPU or the compiler.
8430 - * That said, using RCU_INIT_POINTER() when you should have used
8431 - * rcu_assign_pointer() is a very bad thing that results in
8432 - * impossible-to-diagnose memory corruption.  So please be careful.
8433 - * See the RCU_INIT_POINTER() comment header for details.
8434 - *
8435 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8436 - * once, appearances notwithstanding.  One of the "extra" evaluations
8437 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8438 - * neither of which actually execute the argument.  As with most cpp
8439 - * macros, this execute-arguments-only-once property is important, so
8440 - * please be careful when making changes to rcu_assign_pointer() and the
8441 - * other macros that it invokes.
8442 - */
8443 -#define rcu_assign_pointer(p, v)                                             \
8444 -({                                                                           \
8445 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8446 -                                                                             \
8447 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8448 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8449 -       else                                                                  \
8450 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8451 -       _r_a_p__v;                                                            \
8454 -/**
8455   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8456   * @p: The pointer to read
8457   *
8458 @@ -951,10 +922,14 @@
8459  static inline void rcu_read_lock_bh(void)
8461         local_bh_disable();
8462 +#ifdef CONFIG_PREEMPT_RT_FULL
8463 +       rcu_read_lock();
8464 +#else
8465         __acquire(RCU_BH);
8466         rcu_lock_acquire(&rcu_bh_lock_map);
8467         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8468                          "rcu_read_lock_bh() used illegally while idle");
8469 +#endif
8472  /*
8473 @@ -964,10 +939,14 @@
8474   */
8475  static inline void rcu_read_unlock_bh(void)
8477 +#ifdef CONFIG_PREEMPT_RT_FULL
8478 +       rcu_read_unlock();
8479 +#else
8480         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8481                          "rcu_read_unlock_bh() used illegally while idle");
8482         rcu_lock_release(&rcu_bh_lock_map);
8483         __release(RCU_BH);
8484 +#endif
8485         local_bh_enable();
8488 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcutree.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcutree.h
8489 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rcutree.h 2017-04-16 10:38:26.000000000 +0200
8490 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rcutree.h      2017-04-18 17:54:26.000000000 +0200
8491 @@ -44,7 +44,11 @@
8492         rcu_note_context_switch();
8495 +#ifdef CONFIG_PREEMPT_RT_FULL
8496 +# define synchronize_rcu_bh    synchronize_rcu
8497 +#else
8498  void synchronize_rcu_bh(void);
8499 +#endif
8500  void synchronize_sched_expedited(void);
8501  void synchronize_rcu_expedited(void);
8503 @@ -72,7 +76,11 @@
8506  void rcu_barrier(void);
8507 +#ifdef CONFIG_PREEMPT_RT_FULL
8508 +# define rcu_barrier_bh                rcu_barrier
8509 +#else
8510  void rcu_barrier_bh(void);
8511 +#endif
8512  void rcu_barrier_sched(void);
8513  unsigned long get_state_synchronize_rcu(void);
8514  void cond_synchronize_rcu(unsigned long oldstate);
8515 @@ -82,17 +90,14 @@
8516  extern unsigned long rcutorture_testseq;
8517  extern unsigned long rcutorture_vernum;
8518  unsigned long rcu_batches_started(void);
8519 -unsigned long rcu_batches_started_bh(void);
8520  unsigned long rcu_batches_started_sched(void);
8521  unsigned long rcu_batches_completed(void);
8522 -unsigned long rcu_batches_completed_bh(void);
8523  unsigned long rcu_batches_completed_sched(void);
8524  unsigned long rcu_exp_batches_completed(void);
8525  unsigned long rcu_exp_batches_completed_sched(void);
8526  void show_rcu_gp_kthreads(void);
8528  void rcu_force_quiescent_state(void);
8529 -void rcu_bh_force_quiescent_state(void);
8530  void rcu_sched_force_quiescent_state(void);
8532  void rcu_idle_enter(void);
8533 @@ -109,6 +114,16 @@
8535  bool rcu_is_watching(void);
8537 +#ifndef CONFIG_PREEMPT_RT_FULL
8538 +void rcu_bh_force_quiescent_state(void);
8539 +unsigned long rcu_batches_started_bh(void);
8540 +unsigned long rcu_batches_completed_bh(void);
8541 +#else
8542 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
8543 +# define rcu_batches_completed_bh      rcu_batches_completed
8544 +# define rcu_batches_started_bh                rcu_batches_completed
8545 +#endif
8547  void rcu_all_qs(void);
8549  /* RCUtree hotplug events */
8550 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rtmutex.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rtmutex.h
8551 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rtmutex.h 2017-04-16 10:38:26.000000000 +0200
8552 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rtmutex.h      2017-04-18 17:54:26.000000000 +0200
8553 @@ -13,11 +13,15 @@
8554  #define __LINUX_RT_MUTEX_H
8556  #include <linux/linkage.h>
8557 +#include <linux/spinlock_types_raw.h>
8558  #include <linux/rbtree.h>
8559 -#include <linux/spinlock_types.h>
8561  extern int max_lock_depth; /* for sysctl */
8563 +#ifdef CONFIG_DEBUG_MUTEXES
8564 +#include <linux/debug_locks.h>
8565 +#endif
8567  /**
8568   * The rt_mutex structure
8569   *
8570 @@ -31,8 +35,8 @@
8571         struct rb_root          waiters;
8572         struct rb_node          *waiters_leftmost;
8573         struct task_struct      *owner;
8574 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8575         int                     save_state;
8576 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8577         const char              *name, *file;
8578         int                     line;
8579         void                    *magic;
8580 @@ -55,22 +59,33 @@
8581  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
8582  #endif
8584 +# define rt_mutex_init(mutex)                                  \
8585 +       do {                                                    \
8586 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
8587 +               __rt_mutex_init(mutex, #mutex);                 \
8588 +       } while (0)
8590  #ifdef CONFIG_DEBUG_RT_MUTEXES
8591  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8592         , .name = #mutexname, .file = __FILE__, .line = __LINE__
8593 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
8594   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8595  #else
8596  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8597 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
8598  # define rt_mutex_debug_task_free(t)                   do { } while (0)
8599  #endif
8601 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8602 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8603 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8604 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8605         , .waiters = RB_ROOT \
8606         , .owner = NULL \
8607 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8608 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8610 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8611 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8613 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8614 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
8615 +       , .save_state = 1 }
8617  #define DEFINE_RT_MUTEX(mutexname) \
8618         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8619 @@ -90,7 +105,9 @@
8620  extern void rt_mutex_destroy(struct rt_mutex *lock);
8622  extern void rt_mutex_lock(struct rt_mutex *lock);
8623 +extern int rt_mutex_lock_state(struct rt_mutex *lock, int state);
8624  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8625 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8626  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8627                                struct hrtimer_sleeper *timeout);
8629 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_rt.h
8630 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_rt.h       1970-01-01 01:00:00.000000000 +0100
8631 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_rt.h    2017-04-18 17:54:26.000000000 +0200
8632 @@ -0,0 +1,99 @@
8633 +#ifndef __LINUX_RWLOCK_RT_H
8634 +#define __LINUX_RWLOCK_RT_H
8636 +#ifndef __LINUX_SPINLOCK_H
8637 +#error Do not include directly. Use spinlock.h
8638 +#endif
8640 +#define rwlock_init(rwl)                               \
8641 +do {                                                   \
8642 +       static struct lock_class_key __key;             \
8643 +                                                       \
8644 +       rt_mutex_init(&(rwl)->lock);                    \
8645 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
8646 +} while (0)
8648 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8649 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8650 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8651 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8652 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8653 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8654 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8655 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8656 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8657 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8659 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
8660 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
8662 +#define write_trylock_irqsave(lock, flags)     \
8663 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8665 +#define read_lock_irqsave(lock, flags)                 \
8666 +       do {                                            \
8667 +               typecheck(unsigned long, flags);        \
8668 +               flags = rt_read_lock_irqsave(lock);     \
8669 +       } while (0)
8671 +#define write_lock_irqsave(lock, flags)                        \
8672 +       do {                                            \
8673 +               typecheck(unsigned long, flags);        \
8674 +               flags = rt_write_lock_irqsave(lock);    \
8675 +       } while (0)
8677 +#define read_lock(lock)                rt_read_lock(lock)
8679 +#define read_lock_bh(lock)                             \
8680 +       do {                                            \
8681 +               local_bh_disable();                     \
8682 +               rt_read_lock(lock);                     \
8683 +       } while (0)
8685 +#define read_lock_irq(lock)    read_lock(lock)
8687 +#define write_lock(lock)       rt_write_lock(lock)
8689 +#define write_lock_bh(lock)                            \
8690 +       do {                                            \
8691 +               local_bh_disable();                     \
8692 +               rt_write_lock(lock);                    \
8693 +       } while (0)
8695 +#define write_lock_irq(lock)   write_lock(lock)
8697 +#define read_unlock(lock)      rt_read_unlock(lock)
8699 +#define read_unlock_bh(lock)                           \
8700 +       do {                                            \
8701 +               rt_read_unlock(lock);                   \
8702 +               local_bh_enable();                      \
8703 +       } while (0)
8705 +#define read_unlock_irq(lock)  read_unlock(lock)
8707 +#define write_unlock(lock)     rt_write_unlock(lock)
8709 +#define write_unlock_bh(lock)                          \
8710 +       do {                                            \
8711 +               rt_write_unlock(lock);                  \
8712 +               local_bh_enable();                      \
8713 +       } while (0)
8715 +#define write_unlock_irq(lock) write_unlock(lock)
8717 +#define read_unlock_irqrestore(lock, flags)            \
8718 +       do {                                            \
8719 +               typecheck(unsigned long, flags);        \
8720 +               (void) flags;                           \
8721 +               rt_read_unlock(lock);                   \
8722 +       } while (0)
8724 +#define write_unlock_irqrestore(lock, flags) \
8725 +       do {                                            \
8726 +               typecheck(unsigned long, flags);        \
8727 +               (void) flags;                           \
8728 +               rt_write_unlock(lock);                  \
8729 +       } while (0)
8731 +#endif
8732 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_types.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_types.h
8733 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_types.h    2017-04-16 10:38:26.000000000 +0200
8734 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_types.h 2017-04-18 17:54:26.000000000 +0200
8735 @@ -1,6 +1,10 @@
8736  #ifndef __LINUX_RWLOCK_TYPES_H
8737  #define __LINUX_RWLOCK_TYPES_H
8739 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8740 +# error "Do not include directly, include spinlock_types.h"
8741 +#endif
8743  /*
8744   * include/linux/rwlock_types.h - generic rwlock type definitions
8745   *                               and initializers
8746 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_types_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_types_rt.h
8747 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
8748 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwlock_types_rt.h      2017-04-18 17:54:26.000000000 +0200
8749 @@ -0,0 +1,33 @@
8750 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8751 +#define __LINUX_RWLOCK_TYPES_RT_H
8753 +#ifndef __LINUX_SPINLOCK_TYPES_H
8754 +#error "Do not include directly. Include spinlock_types.h instead"
8755 +#endif
8758 + * rwlocks - rtmutex which allows single reader recursion
8759 + */
8760 +typedef struct {
8761 +       struct rt_mutex         lock;
8762 +       int                     read_depth;
8763 +       unsigned int            break_lock;
8764 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8765 +       struct lockdep_map      dep_map;
8766 +#endif
8767 +} rwlock_t;
8769 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8770 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
8771 +#else
8772 +# define RW_DEP_MAP_INIT(lockname)
8773 +#endif
8775 +#define __RW_LOCK_UNLOCKED(name) \
8776 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8777 +         RW_DEP_MAP_INIT(name) }
8779 +#define DEFINE_RWLOCK(name) \
8780 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
8782 +#endif
8783 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwsem.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwsem.h
8784 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwsem.h   2017-04-16 10:38:26.000000000 +0200
8785 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwsem.h        2017-04-18 17:54:26.000000000 +0200
8786 @@ -19,6 +19,10 @@
8787  #include <linux/osq_lock.h>
8788  #endif
8790 +#ifdef CONFIG_PREEMPT_RT_FULL
8791 +#include <linux/rwsem_rt.h>
8792 +#else /* PREEMPT_RT_FULL */
8794  struct rw_semaphore;
8796  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
8797 @@ -106,6 +110,13 @@
8798         return !list_empty(&sem->wait_list);
8801 +#endif /* !PREEMPT_RT_FULL */
8804 + * The functions below are the same for all rwsem implementations including
8805 + * the RT specific variant.
8806 + */
8808  /*
8809   * lock for reading
8810   */
8811 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwsem_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwsem_rt.h
8812 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/rwsem_rt.h        1970-01-01 01:00:00.000000000 +0100
8813 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/rwsem_rt.h     2017-04-18 17:54:26.000000000 +0200
8814 @@ -0,0 +1,67 @@
8815 +#ifndef _LINUX_RWSEM_RT_H
8816 +#define _LINUX_RWSEM_RT_H
8818 +#ifndef _LINUX_RWSEM_H
8819 +#error "Include rwsem.h"
8820 +#endif
8822 +#include <linux/rtmutex.h>
8823 +#include <linux/swait.h>
8825 +#define READER_BIAS            (1U << 31)
8826 +#define WRITER_BIAS            (1U << 30)
8828 +struct rw_semaphore {
8829 +       atomic_t                readers;
8830 +       struct rt_mutex         rtmutex;
8831 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8832 +       struct lockdep_map      dep_map;
8833 +#endif
8836 +#define __RWSEM_INITIALIZER(name)                              \
8837 +{                                                              \
8838 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
8839 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
8840 +       RW_DEP_MAP_INIT(name)                                   \
8843 +#define DECLARE_RWSEM(lockname) \
8844 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
8846 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
8847 +                         struct lock_class_key *key);
8849 +#define __init_rwsem(sem, name, key)                   \
8850 +do {                                                   \
8851 +               rt_mutex_init(&(sem)->rtmutex);         \
8852 +               __rwsem_init((sem), (name), (key));     \
8853 +} while (0)
8855 +#define init_rwsem(sem)                                        \
8856 +do {                                                   \
8857 +       static struct lock_class_key __key;             \
8858 +                                                       \
8859 +       __init_rwsem((sem), #sem, &__key);              \
8860 +} while (0)
8862 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
8864 +       return atomic_read(&sem->readers) != READER_BIAS;
8867 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
8869 +       return atomic_read(&sem->readers) > 0;
8872 +extern void __down_read(struct rw_semaphore *sem);
8873 +extern int __down_read_trylock(struct rw_semaphore *sem);
8874 +extern void __down_write(struct rw_semaphore *sem);
8875 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
8876 +extern int __down_write_trylock(struct rw_semaphore *sem);
8877 +extern void __up_read(struct rw_semaphore *sem);
8878 +extern void __up_write(struct rw_semaphore *sem);
8879 +extern void __downgrade_write(struct rw_semaphore *sem);
8881 +#endif
8882 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/sched.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/sched.h
8883 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/sched.h   2017-04-16 10:38:26.000000000 +0200
8884 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/sched.h        2017-04-18 17:54:26.000000000 +0200
8885 @@ -26,6 +26,7 @@
8886  #include <linux/nodemask.h>
8887  #include <linux/mm_types.h>
8888  #include <linux/preempt.h>
8889 +#include <asm/kmap_types.h>
8891  #include <asm/page.h>
8892  #include <asm/ptrace.h>
8893 @@ -243,10 +244,7 @@
8894                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
8895                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
8897 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
8898  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
8899 -#define task_is_stopped_or_traced(task)        \
8900 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
8901  #define task_contributes_to_load(task) \
8902                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
8903                                  (task->flags & PF_FROZEN) == 0 && \
8904 @@ -312,6 +310,11 @@
8906  #endif
8908 +#define __set_current_state_no_track(state_value)      \
8909 +       do { current->state = (state_value); } while (0)
8910 +#define set_current_state_no_track(state_value)                \
8911 +       set_mb(current->state, (state_value))
8913  /* Task command name length */
8914  #define TASK_COMM_LEN 16
8916 @@ -1013,8 +1016,18 @@
8917         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
8919  extern void wake_q_add(struct wake_q_head *head,
8920 -                      struct task_struct *task);
8921 -extern void wake_up_q(struct wake_q_head *head);
8922 +                             struct task_struct *task);
8923 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
8925 +static inline void wake_up_q(struct wake_q_head *head)
8927 +       __wake_up_q(head, false);
8930 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
8932 +       __wake_up_q(head, true);
8935  /*
8936   * sched-domains (multiprocessor balancing) declarations:
8937 @@ -1481,6 +1494,7 @@
8938         struct thread_info thread_info;
8939  #endif
8940         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
8941 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
8942         void *stack;
8943         atomic_t usage;
8944         unsigned int flags;     /* per process flags, defined below */
8945 @@ -1520,6 +1534,12 @@
8946  #endif
8948         unsigned int policy;
8949 +#ifdef CONFIG_PREEMPT_RT_FULL
8950 +       int migrate_disable;
8951 +# ifdef CONFIG_SCHED_DEBUG
8952 +       int migrate_disable_atomic;
8953 +# endif
8954 +#endif
8955         int nr_cpus_allowed;
8956         cpumask_t cpus_allowed;
8958 @@ -1654,6 +1674,9 @@
8960         struct task_cputime cputime_expires;
8961         struct list_head cpu_timers[3];
8962 +#ifdef CONFIG_PREEMPT_RT_BASE
8963 +       struct task_struct *posix_timer_list;
8964 +#endif
8966  /* process credentials */
8967         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
8968 @@ -1685,10 +1708,15 @@
8969  /* signal handlers */
8970         struct signal_struct *signal;
8971         struct sighand_struct *sighand;
8972 +       struct sigqueue *sigqueue_cache;
8974         sigset_t blocked, real_blocked;
8975         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
8976         struct sigpending pending;
8977 +#ifdef CONFIG_PREEMPT_RT_FULL
8978 +       /* TODO: move me into ->restart_block ? */
8979 +       struct siginfo forced_info;
8980 +#endif
8982         unsigned long sas_ss_sp;
8983         size_t sas_ss_size;
8984 @@ -1917,6 +1945,12 @@
8985         /* bitmask and counter of trace recursion */
8986         unsigned long trace_recursion;
8987  #endif /* CONFIG_TRACING */
8988 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
8989 +       u64 preempt_timestamp_hist;
8990 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
8991 +       long timer_offset;
8992 +#endif
8993 +#endif
8994  #ifdef CONFIG_KCOV
8995         /* Coverage collection mode enabled for this task (0 if disabled). */
8996         enum kcov_mode kcov_mode;
8997 @@ -1942,9 +1976,23 @@
8998         unsigned int    sequential_io;
8999         unsigned int    sequential_io_avg;
9000  #endif
9001 +#ifdef CONFIG_PREEMPT_RT_BASE
9002 +       struct rcu_head put_rcu;
9003 +       int softirq_nestcnt;
9004 +       unsigned int softirqs_raised;
9005 +#endif
9006 +#ifdef CONFIG_PREEMPT_RT_FULL
9007 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9008 +       int kmap_idx;
9009 +       pte_t kmap_pte[KM_TYPE_NR];
9010 +# endif
9011 +#endif
9012  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9013         unsigned long   task_state_change;
9014  #endif
9015 +#ifdef CONFIG_PREEMPT_RT_FULL
9016 +       int xmit_recursion;
9017 +#endif
9018         int pagefault_disabled;
9019  #ifdef CONFIG_MMU
9020         struct task_struct *oom_reaper_list;
9021 @@ -1984,14 +2032,6 @@
9023  #endif
9025 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9026 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9028 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9030 -       return p->nr_cpus_allowed;
9033  #define TNF_MIGRATED   0x01
9034  #define TNF_NO_GROUP   0x02
9035  #define TNF_SHARED     0x04
9036 @@ -2207,6 +2247,15 @@
9037  extern void free_task(struct task_struct *tsk);
9038  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9040 +#ifdef CONFIG_PREEMPT_RT_BASE
9041 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9043 +static inline void put_task_struct(struct task_struct *t)
9045 +       if (atomic_dec_and_test(&t->usage))
9046 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9048 +#else
9049  extern void __put_task_struct(struct task_struct *t);
9051  static inline void put_task_struct(struct task_struct *t)
9052 @@ -2214,6 +2263,7 @@
9053         if (atomic_dec_and_test(&t->usage))
9054                 __put_task_struct(t);
9056 +#endif
9058  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9059  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9060 @@ -2255,6 +2305,7 @@
9061  /*
9062   * Per process flags
9063   */
9064 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9065  #define PF_EXITING     0x00000004      /* getting shut down */
9066  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9067  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9068 @@ -2423,6 +2474,10 @@
9070  extern int set_cpus_allowed_ptr(struct task_struct *p,
9071                                 const struct cpumask *new_mask);
9072 +int migrate_me(void);
9073 +void tell_sched_cpu_down_begin(int cpu);
9074 +void tell_sched_cpu_down_done(int cpu);
9076  #else
9077  static inline void do_set_cpus_allowed(struct task_struct *p,
9078                                       const struct cpumask *new_mask)
9079 @@ -2435,6 +2490,9 @@
9080                 return -EINVAL;
9081         return 0;
9083 +static inline int migrate_me(void) { return 0; }
9084 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9085 +static inline void tell_sched_cpu_down_done(int cpu) { }
9086  #endif
9088  #ifdef CONFIG_NO_HZ_COMMON
9089 @@ -2673,6 +2731,7 @@
9091  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9092  extern int wake_up_process(struct task_struct *tsk);
9093 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9094  extern void wake_up_new_task(struct task_struct *tsk);
9095  #ifdef CONFIG_SMP
9096   extern void kick_process(struct task_struct *tsk);
9097 @@ -2881,6 +2940,17 @@
9098                 __mmdrop(mm);
9101 +#ifdef CONFIG_PREEMPT_RT_BASE
9102 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9103 +static inline void mmdrop_delayed(struct mm_struct *mm)
9105 +       if (atomic_dec_and_test(&mm->mm_count))
9106 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9108 +#else
9109 +# define mmdrop_delayed(mm)    mmdrop(mm)
9110 +#endif
9112  static inline void mmdrop_async_fn(struct work_struct *work)
9114         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9115 @@ -3273,6 +3343,43 @@
9116         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9119 +#ifdef CONFIG_PREEMPT_LAZY
9120 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9122 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9125 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9127 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9130 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9132 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9135 +static inline int need_resched_lazy(void)
9137 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9140 +static inline int need_resched_now(void)
9142 +       return test_thread_flag(TIF_NEED_RESCHED);
9145 +#else
9146 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9147 +static inline int need_resched_lazy(void) { return 0; }
9149 +static inline int need_resched_now(void)
9151 +       return test_thread_flag(TIF_NEED_RESCHED);
9154 +#endif
9156  static inline int restart_syscall(void)
9158         set_tsk_thread_flag(current, TIF_SIGPENDING);
9159 @@ -3304,6 +3411,51 @@
9160         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9163 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9165 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9166 +               return true;
9167 +#ifdef CONFIG_PREEMPT_RT_FULL
9168 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9169 +               return true;
9170 +#endif
9171 +       return false;
9174 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9176 +       bool traced_stopped;
9178 +#ifdef CONFIG_PREEMPT_RT_FULL
9179 +       unsigned long flags;
9181 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9182 +       traced_stopped = __task_is_stopped_or_traced(task);
9183 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9184 +#else
9185 +       traced_stopped = __task_is_stopped_or_traced(task);
9186 +#endif
9187 +       return traced_stopped;
9190 +static inline bool task_is_traced(struct task_struct *task)
9192 +       bool traced = false;
9194 +       if (task->state & __TASK_TRACED)
9195 +               return true;
9196 +#ifdef CONFIG_PREEMPT_RT_FULL
9197 +       /* in case the task is sleeping on tasklist_lock */
9198 +       raw_spin_lock_irq(&task->pi_lock);
9199 +       if (task->state & __TASK_TRACED)
9200 +               traced = true;
9201 +       else if (task->saved_state & __TASK_TRACED)
9202 +               traced = true;
9203 +       raw_spin_unlock_irq(&task->pi_lock);
9204 +#endif
9205 +       return traced;
9208  /*
9209   * cond_resched() and cond_resched_lock(): latency reduction via
9210   * explicit rescheduling in places that are safe. The return
9211 @@ -3329,12 +3481,16 @@
9212         __cond_resched_lock(lock);                              \
9213  })
9215 +#ifndef CONFIG_PREEMPT_RT_FULL
9216  extern int __cond_resched_softirq(void);
9218  #define cond_resched_softirq() ({                                      \
9219         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9220         __cond_resched_softirq();                                       \
9221  })
9222 +#else
9223 +# define cond_resched_softirq()                cond_resched()
9224 +#endif
9226  static inline void cond_resched_rcu(void)
9228 @@ -3509,6 +3665,31 @@
9230  #endif /* CONFIG_SMP */
9232 +static inline int __migrate_disabled(struct task_struct *p)
9234 +#ifdef CONFIG_PREEMPT_RT_FULL
9235 +       return p->migrate_disable;
9236 +#else
9237 +       return 0;
9238 +#endif
9241 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9242 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9244 +       if (__migrate_disabled(p))
9245 +               return cpumask_of(task_cpu(p));
9247 +       return &p->cpus_allowed;
9250 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9252 +       if (__migrate_disabled(p))
9253 +               return 1;
9254 +       return p->nr_cpus_allowed;
9257  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9258  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9260 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/seqlock.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/seqlock.h
9261 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/seqlock.h 2017-04-16 10:38:26.000000000 +0200
9262 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/seqlock.h      2017-04-18 17:54:26.000000000 +0200
9263 @@ -220,20 +220,30 @@
9264         return __read_seqcount_retry(s, start);
9269 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9270 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9272         s->sequence++;
9273         smp_wmb();
9276 -static inline void raw_write_seqcount_end(seqcount_t *s)
9277 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9279 +       preempt_disable_rt();
9280 +       __raw_write_seqcount_begin(s);
9283 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9285         smp_wmb();
9286         s->sequence++;
9289 +static inline void raw_write_seqcount_end(seqcount_t *s)
9291 +       __raw_write_seqcount_end(s);
9292 +       preempt_enable_rt();
9295  /**
9296   * raw_write_seqcount_barrier - do a seq write barrier
9297   * @s: pointer to seqcount_t
9298 @@ -428,10 +438,32 @@
9299  /*
9300   * Read side functions for starting and finalizing a read side section.
9301   */
9302 +#ifndef CONFIG_PREEMPT_RT_FULL
9303  static inline unsigned read_seqbegin(const seqlock_t *sl)
9305         return read_seqcount_begin(&sl->seqcount);
9307 +#else
9309 + * Starvation safe read side for RT
9310 + */
9311 +static inline unsigned read_seqbegin(seqlock_t *sl)
9313 +       unsigned ret;
9315 +repeat:
9316 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9317 +       if (unlikely(ret & 1)) {
9318 +               /*
9319 +                * Take the lock and let the writer proceed (i.e. possibly
9320 +                * boost it), otherwise we could loop here forever.
9321 +                */
9322 +               spin_unlock_wait(&sl->lock);
9323 +               goto repeat;
9324 +       }
9325 +       return ret;
9327 +#endif
9329  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9331 @@ -446,36 +478,45 @@
9332  static inline void write_seqlock(seqlock_t *sl)
9334         spin_lock(&sl->lock);
9335 -       write_seqcount_begin(&sl->seqcount);
9336 +       __raw_write_seqcount_begin(&sl->seqcount);
9339 +static inline int try_write_seqlock(seqlock_t *sl)
9341 +       if (spin_trylock(&sl->lock)) {
9342 +               __raw_write_seqcount_begin(&sl->seqcount);
9343 +               return 1;
9344 +       }
9345 +       return 0;
9348  static inline void write_sequnlock(seqlock_t *sl)
9350 -       write_seqcount_end(&sl->seqcount);
9351 +       __raw_write_seqcount_end(&sl->seqcount);
9352         spin_unlock(&sl->lock);
9355  static inline void write_seqlock_bh(seqlock_t *sl)
9357         spin_lock_bh(&sl->lock);
9358 -       write_seqcount_begin(&sl->seqcount);
9359 +       __raw_write_seqcount_begin(&sl->seqcount);
9362  static inline void write_sequnlock_bh(seqlock_t *sl)
9364 -       write_seqcount_end(&sl->seqcount);
9365 +       __raw_write_seqcount_end(&sl->seqcount);
9366         spin_unlock_bh(&sl->lock);
9369  static inline void write_seqlock_irq(seqlock_t *sl)
9371         spin_lock_irq(&sl->lock);
9372 -       write_seqcount_begin(&sl->seqcount);
9373 +       __raw_write_seqcount_begin(&sl->seqcount);
9376  static inline void write_sequnlock_irq(seqlock_t *sl)
9378 -       write_seqcount_end(&sl->seqcount);
9379 +       __raw_write_seqcount_end(&sl->seqcount);
9380         spin_unlock_irq(&sl->lock);
9383 @@ -484,7 +525,7 @@
9384         unsigned long flags;
9386         spin_lock_irqsave(&sl->lock, flags);
9387 -       write_seqcount_begin(&sl->seqcount);
9388 +       __raw_write_seqcount_begin(&sl->seqcount);
9389         return flags;
9392 @@ -494,7 +535,7 @@
9393  static inline void
9394  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9396 -       write_seqcount_end(&sl->seqcount);
9397 +       __raw_write_seqcount_end(&sl->seqcount);
9398         spin_unlock_irqrestore(&sl->lock, flags);
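A minimal usage sketch of the seqlock API as wired up above (names are illustrative, not part of the patch): the write side still takes the internal spinlock and bumps the sequence via __raw_write_seqcount_begin()/_end(), while the RT read side waits on the lock instead of spinning on an odd sequence.

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_stats_lock);         /* hypothetical lock/data */
static u64 demo_packets, demo_bytes;

static void demo_stats_update(u64 pkts, u64 bytes)
{
        write_seqlock(&demo_stats_lock);        /* spin_lock + __raw_write_seqcount_begin() */
        demo_packets += pkts;
        demo_bytes += bytes;
        write_sequnlock(&demo_stats_lock);      /* __raw_write_seqcount_end() + spin_unlock */
}

static void demo_stats_read(u64 *pkts, u64 *bytes)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&demo_stats_lock);  /* RT: waits via spin_unlock_wait() if a write is in flight */
                *pkts = demo_packets;
                *bytes = demo_bytes;
        } while (read_seqretry(&demo_stats_lock, seq));
}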
9401 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/signal.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/signal.h
9402 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/signal.h  2017-04-16 10:38:26.000000000 +0200
9403 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/signal.h       2017-04-18 17:54:26.000000000 +0200
9404 @@ -233,6 +233,7 @@
9407  extern void flush_sigqueue(struct sigpending *queue);
9408 +extern void flush_task_sigqueue(struct task_struct *tsk);
9410  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9411  static inline int valid_signal(unsigned long sig)
9412 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/skbuff.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/skbuff.h
9413 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/skbuff.h  2017-04-16 10:38:26.000000000 +0200
9414 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/skbuff.h       2017-04-18 17:54:26.000000000 +0200
9415 @@ -284,6 +284,7 @@
9417         __u32           qlen;
9418         spinlock_t      lock;
9419 +       raw_spinlock_t  raw_lock;
9420  };
9422  struct sk_buff;
9423 @@ -1573,6 +1574,12 @@
9424         __skb_queue_head_init(list);
9427 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9429 +       raw_spin_lock_init(&list->raw_lock);
9430 +       __skb_queue_head_init(list);
9433  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9434                 struct lock_class_key *class)
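A hedged sketch of how the new raw_lock/skb_queue_head_init_raw() pair might be used for a queue that must be touched from truly atomic context even on PREEMPT_RT_FULL; the queue and helper names are illustrative, not part of the patch.

#include <linux/skbuff.h>

static struct sk_buff_head demo_rxq;            /* hypothetical queue */

static void demo_rxq_init(void)
{
        skb_queue_head_init_raw(&demo_rxq);     /* initialises ->raw_lock instead of ->lock */
}

static void demo_rxq_add(struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_rxq.raw_lock, flags);
        __skb_queue_tail(&demo_rxq, skb);       /* lockless helper; caller holds raw_lock */
        raw_spin_unlock_irqrestore(&demo_rxq.raw_lock, flags);
}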
9436 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/smp.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/smp.h
9437 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/smp.h     2017-04-16 10:38:26.000000000 +0200
9438 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/smp.h  2017-04-18 17:54:26.000000000 +0200
9439 @@ -120,6 +120,13 @@
9440  extern void __init setup_nr_cpu_ids(void);
9441  extern void __init smp_init(void);
9443 +extern int __boot_cpu_id;
9445 +static inline int get_boot_cpu_id(void)
9447 +       return __boot_cpu_id;
9450  #else /* !SMP */
9452  static inline void smp_send_stop(void) { }
9453 @@ -158,6 +165,11 @@
9454  static inline void smp_init(void) { }
9455  #endif
9457 +static inline int get_boot_cpu_id(void)
9459 +       return 0;
9462  #endif /* !SMP */
9464  /*
9465 @@ -185,6 +197,9 @@
9466  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
9467  #define put_cpu()              preempt_enable()
9469 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
9470 +#define put_cpu_light()                migrate_enable()
9472  /*
9473   * Callback to arch code if there's nosmp or maxcpus=0 on the
9474   * boot command line:
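get_cpu_light()/put_cpu_light() give a stable CPU number without disabling preemption; a minimal sketch under that assumption (function name illustrative).

#include <linux/smp.h>
#include <linux/printk.h>

static void demo_report_cpu(void)
{
        int cpu = get_cpu_light();              /* migrate_disable() + smp_processor_id() */

        pr_info("demo: running on CPU %d (stable, but still preemptible)\n", cpu);
        put_cpu_light();                        /* migrate_enable() */
}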
9475 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock.h
9476 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock.h        2017-04-16 10:38:26.000000000 +0200
9477 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock.h     2017-04-18 17:54:26.000000000 +0200
9478 @@ -271,7 +271,11 @@
9479  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
9481  /* Include rwlock functions */
9482 -#include <linux/rwlock.h>
9483 +#ifdef CONFIG_PREEMPT_RT_FULL
9484 +# include <linux/rwlock_rt.h>
9485 +#else
9486 +# include <linux/rwlock.h>
9487 +#endif
9489  /*
9490   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9491 @@ -282,6 +286,10 @@
9492  # include <linux/spinlock_api_up.h>
9493  #endif
9495 +#ifdef CONFIG_PREEMPT_RT_FULL
9496 +# include <linux/spinlock_rt.h>
9497 +#else /* PREEMPT_RT_FULL */
9499  /*
9500   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9501   */
9502 @@ -416,4 +424,6 @@
9503  #define atomic_dec_and_lock(atomic, lock) \
9504                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9506 +#endif /* !PREEMPT_RT_FULL */
9508  #endif /* __LINUX_SPINLOCK_H */
9509 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_api_smp.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_api_smp.h
9510 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_api_smp.h        2017-04-16 10:38:26.000000000 +0200
9511 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_api_smp.h     2017-04-18 17:54:26.000000000 +0200
9512 @@ -189,6 +189,8 @@
9513         return 0;
9516 -#include <linux/rwlock_api_smp.h>
9517 +#ifndef CONFIG_PREEMPT_RT_FULL
9518 +# include <linux/rwlock_api_smp.h>
9519 +#endif
9521  #endif /* __LINUX_SPINLOCK_API_SMP_H */
9522 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_rt.h
9523 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_rt.h     1970-01-01 01:00:00.000000000 +0100
9524 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_rt.h  2017-04-18 17:54:26.000000000 +0200
9525 @@ -0,0 +1,162 @@
9526 +#ifndef __LINUX_SPINLOCK_RT_H
9527 +#define __LINUX_SPINLOCK_RT_H
9529 +#ifndef __LINUX_SPINLOCK_H
9530 +#error Do not include directly. Use spinlock.h
9531 +#endif
9533 +#include <linux/bug.h>
9535 +extern void
9536 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9538 +#define spin_lock_init(slock)                          \
9539 +do {                                                   \
9540 +       static struct lock_class_key __key;             \
9541 +                                                       \
9542 +       rt_mutex_init(&(slock)->lock);                  \
9543 +       __rt_spin_lock_init(slock, #slock, &__key);     \
9544 +} while (0)
9546 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9547 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9548 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9550 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9551 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9552 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9553 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9554 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9555 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9556 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9557 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9558 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9561 + * lockdep-less calls, for derived types like rwlock:
9562 + * (for trylock they can use rt_mutex_trylock() directly).
9563 + */
9564 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9565 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9566 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9568 +#define spin_lock(lock)                        rt_spin_lock(lock)
9570 +#define spin_lock_bh(lock)                     \
9571 +       do {                                    \
9572 +               local_bh_disable();             \
9573 +               rt_spin_lock(lock);             \
9574 +       } while (0)
9576 +#define spin_lock_irq(lock)            spin_lock(lock)
9578 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
9580 +#define spin_trylock(lock)                     \
9581 +({                                             \
9582 +       int __locked;                           \
9583 +       __locked = spin_do_trylock(lock);       \
9584 +       __locked;                               \
9587 +#ifdef CONFIG_LOCKDEP
9588 +# define spin_lock_nested(lock, subclass)              \
9589 +       do {                                            \
9590 +               rt_spin_lock_nested(lock, subclass);    \
9591 +       } while (0)
9593 +#define spin_lock_bh_nested(lock, subclass)            \
9594 +       do {                                            \
9595 +               local_bh_disable();                     \
9596 +               rt_spin_lock_nested(lock, subclass);    \
9597 +       } while (0)
9599 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9600 +       do {                                             \
9601 +               typecheck(unsigned long, flags);         \
9602 +               flags = 0;                               \
9603 +               rt_spin_lock_nested(lock, subclass);     \
9604 +       } while (0)
9605 +#else
9606 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
9607 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
9609 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9610 +       do {                                             \
9611 +               typecheck(unsigned long, flags);         \
9612 +               flags = 0;                               \
9613 +               spin_lock(lock);                         \
9614 +       } while (0)
9615 +#endif
9617 +#define spin_lock_irqsave(lock, flags)                  \
9618 +       do {                                             \
9619 +               typecheck(unsigned long, flags);         \
9620 +               flags = 0;                               \
9621 +               spin_lock(lock);                         \
9622 +       } while (0)
9624 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9626 +       unsigned long flags = 0;
9627 +#ifdef CONFIG_TRACE_IRQFLAGS
9628 +       flags = rt_spin_lock_trace_flags(lock);
9629 +#else
9630 +       spin_lock(lock); /* lock_local */
9631 +#endif
9632 +       return flags;
9635 +/* FIXME: we need rt_spin_lock_nest_lock */
9636 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9638 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
9640 +#define spin_unlock_bh(lock)                           \
9641 +       do {                                            \
9642 +               rt_spin_unlock(lock);                   \
9643 +               local_bh_enable();                      \
9644 +       } while (0)
9646 +#define spin_unlock_irq(lock)          spin_unlock(lock)
9648 +#define spin_unlock_irqrestore(lock, flags)            \
9649 +       do {                                            \
9650 +               typecheck(unsigned long, flags);        \
9651 +               (void) flags;                           \
9652 +               spin_unlock(lock);                      \
9653 +       } while (0)
9655 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
9656 +#define spin_trylock_irq(lock) spin_trylock(lock)
9658 +#define spin_trylock_irqsave(lock, flags)      \
9659 +       rt_spin_trylock_irqsave(lock, &(flags))
9661 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
9663 +#ifdef CONFIG_GENERIC_LOCKBREAK
9664 +# define spin_is_contended(lock)       ((lock)->break_lock)
9665 +#else
9666 +# define spin_is_contended(lock)       (((void)(lock), 0))
9667 +#endif
9669 +static inline int spin_can_lock(spinlock_t *lock)
9671 +       return !rt_mutex_is_locked(&lock->lock);
9674 +static inline int spin_is_locked(spinlock_t *lock)
9676 +       return rt_mutex_is_locked(&lock->lock);
9679 +static inline void assert_spin_locked(spinlock_t *lock)
9681 +       BUG_ON(!spin_is_locked(lock));
9684 +#define atomic_dec_and_lock(atomic, lock) \
9685 +       atomic_dec_and_spin_lock(atomic, lock)
9687 +#endif
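Under PREEMPT_RT_FULL the mapping above means spin_lock_irqsave() neither disables interrupts nor preemption and the flags value is unused; a sketch of what callers actually get (names illustrative).

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);              /* hypothetical rtmutex-backed lock on RT */
static int demo_value;

static void demo_set(int v)
{
        unsigned long flags;

        spin_lock_irqsave(&demo_lock, flags);   /* RT: flags is simply set to 0 */
        demo_value = v;                         /* RT: may sleep on contention */
        spin_unlock_irqrestore(&demo_lock, flags); /* RT: flags is ignored */
}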
9688 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types.h
9689 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types.h  2017-04-16 10:38:26.000000000 +0200
9690 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types.h       2017-04-18 17:54:26.000000000 +0200
9691 @@ -9,80 +9,15 @@
9692   * Released under the General Public License (GPL).
9693   */
9695 -#if defined(CONFIG_SMP)
9696 -# include <asm/spinlock_types.h>
9697 -#else
9698 -# include <linux/spinlock_types_up.h>
9699 -#endif
9701 -#include <linux/lockdep.h>
9703 -typedef struct raw_spinlock {
9704 -       arch_spinlock_t raw_lock;
9705 -#ifdef CONFIG_GENERIC_LOCKBREAK
9706 -       unsigned int break_lock;
9707 -#endif
9708 -#ifdef CONFIG_DEBUG_SPINLOCK
9709 -       unsigned int magic, owner_cpu;
9710 -       void *owner;
9711 -#endif
9712 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9713 -       struct lockdep_map dep_map;
9714 -#endif
9715 -} raw_spinlock_t;
9717 -#define SPINLOCK_MAGIC         0xdead4ead
9719 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9721 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9722 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
9723 -#else
9724 -# define SPIN_DEP_MAP_INIT(lockname)
9725 -#endif
9726 +#include <linux/spinlock_types_raw.h>
9728 -#ifdef CONFIG_DEBUG_SPINLOCK
9729 -# define SPIN_DEBUG_INIT(lockname)             \
9730 -       .magic = SPINLOCK_MAGIC,                \
9731 -       .owner_cpu = -1,                        \
9732 -       .owner = SPINLOCK_OWNER_INIT,
9733 +#ifndef CONFIG_PREEMPT_RT_FULL
9734 +# include <linux/spinlock_types_nort.h>
9735 +# include <linux/rwlock_types.h>
9736  #else
9737 -# define SPIN_DEBUG_INIT(lockname)
9738 +# include <linux/rtmutex.h>
9739 +# include <linux/spinlock_types_rt.h>
9740 +# include <linux/rwlock_types_rt.h>
9741  #endif
9743 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
9744 -       {                                       \
9745 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
9746 -       SPIN_DEBUG_INIT(lockname)               \
9747 -       SPIN_DEP_MAP_INIT(lockname) }
9749 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
9750 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9752 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9754 -typedef struct spinlock {
9755 -       union {
9756 -               struct raw_spinlock rlock;
9758 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9759 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9760 -               struct {
9761 -                       u8 __padding[LOCK_PADSIZE];
9762 -                       struct lockdep_map dep_map;
9763 -               };
9764 -#endif
9765 -       };
9766 -} spinlock_t;
9768 -#define __SPIN_LOCK_INITIALIZER(lockname) \
9769 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9771 -#define __SPIN_LOCK_UNLOCKED(lockname) \
9772 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9774 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9776 -#include <linux/rwlock_types.h>
9778  #endif /* __LINUX_SPINLOCK_TYPES_H */
9779 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_nort.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_nort.h
9780 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_nort.h     1970-01-01 01:00:00.000000000 +0100
9781 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_nort.h  2017-04-18 17:54:26.000000000 +0200
9782 @@ -0,0 +1,33 @@
9783 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
9784 +#define __LINUX_SPINLOCK_TYPES_NORT_H
9786 +#ifndef __LINUX_SPINLOCK_TYPES_H
9787 +#error "Do not include directly. Include spinlock_types.h instead"
9788 +#endif
9791 + * The non RT version maps spinlocks to raw_spinlocks
9792 + */
9793 +typedef struct spinlock {
9794 +       union {
9795 +               struct raw_spinlock rlock;
9797 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9798 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9799 +               struct {
9800 +                       u8 __padding[LOCK_PADSIZE];
9801 +                       struct lockdep_map dep_map;
9802 +               };
9803 +#endif
9804 +       };
9805 +} spinlock_t;
9807 +#define __SPIN_LOCK_INITIALIZER(lockname) \
9808 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9810 +#define __SPIN_LOCK_UNLOCKED(lockname) \
9811 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9813 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9815 +#endif
9816 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_raw.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_raw.h
9817 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_raw.h      1970-01-01 01:00:00.000000000 +0100
9818 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_raw.h   2017-04-18 17:54:26.000000000 +0200
9819 @@ -0,0 +1,56 @@
9820 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
9821 +#define __LINUX_SPINLOCK_TYPES_RAW_H
9823 +#if defined(CONFIG_SMP)
9824 +# include <asm/spinlock_types.h>
9825 +#else
9826 +# include <linux/spinlock_types_up.h>
9827 +#endif
9829 +#include <linux/lockdep.h>
9831 +typedef struct raw_spinlock {
9832 +       arch_spinlock_t raw_lock;
9833 +#ifdef CONFIG_GENERIC_LOCKBREAK
9834 +       unsigned int break_lock;
9835 +#endif
9836 +#ifdef CONFIG_DEBUG_SPINLOCK
9837 +       unsigned int magic, owner_cpu;
9838 +       void *owner;
9839 +#endif
9840 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9841 +       struct lockdep_map dep_map;
9842 +#endif
9843 +} raw_spinlock_t;
9845 +#define SPINLOCK_MAGIC         0xdead4ead
9847 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9849 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9850 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
9851 +#else
9852 +# define SPIN_DEP_MAP_INIT(lockname)
9853 +#endif
9855 +#ifdef CONFIG_DEBUG_SPINLOCK
9856 +# define SPIN_DEBUG_INIT(lockname)             \
9857 +       .magic = SPINLOCK_MAGIC,                \
9858 +       .owner_cpu = -1,                        \
9859 +       .owner = SPINLOCK_OWNER_INIT,
9860 +#else
9861 +# define SPIN_DEBUG_INIT(lockname)
9862 +#endif
9864 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
9865 +       {                                       \
9866 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
9867 +       SPIN_DEBUG_INIT(lockname)               \
9868 +       SPIN_DEP_MAP_INIT(lockname) }
9870 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
9871 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9873 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9875 +#endif
9876 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_rt.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_rt.h
9877 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/spinlock_types_rt.h       1970-01-01 01:00:00.000000000 +0100
9878 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/spinlock_types_rt.h    2017-04-18 17:54:26.000000000 +0200
9879 @@ -0,0 +1,48 @@
9880 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
9881 +#define __LINUX_SPINLOCK_TYPES_RT_H
9883 +#ifndef __LINUX_SPINLOCK_TYPES_H
9884 +#error "Do not include directly. Include spinlock_types.h instead"
9885 +#endif
9887 +#include <linux/cache.h>
9890 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
9891 + */
9892 +typedef struct spinlock {
9893 +       struct rt_mutex         lock;
9894 +       unsigned int            break_lock;
9895 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9896 +       struct lockdep_map      dep_map;
9897 +#endif
9898 +} spinlock_t;
9900 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9901 +# define __RT_SPIN_INITIALIZER(name) \
9902 +       { \
9903 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9904 +       .save_state = 1, \
9905 +       .file = __FILE__, \
9906 +       .line = __LINE__ , \
9907 +       }
9908 +#else
9909 +# define __RT_SPIN_INITIALIZER(name) \
9910 +       {                                                               \
9911 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
9912 +       .save_state = 1, \
9913 +       }
9914 +#endif
9917 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
9920 +#define __SPIN_LOCK_UNLOCKED(name)                     \
9921 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
9922 +         SPIN_DEP_MAP_INIT(name) }
9924 +#define DEFINE_SPINLOCK(name) \
9925 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
9927 +#endif
9928 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/srcu.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/srcu.h
9929 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/srcu.h    2017-04-16 10:38:26.000000000 +0200
9930 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/srcu.h 2017-04-18 17:54:26.000000000 +0200
9931 @@ -84,10 +84,10 @@
9933  void process_srcu(struct work_struct *work);
9935 -#define __SRCU_STRUCT_INIT(name)                                       \
9936 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
9937         {                                                               \
9938                 .completed = -300,                                      \
9939 -               .per_cpu_ref = &name##_srcu_array,                      \
9940 +               .per_cpu_ref = &pcpu_name,                              \
9941                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
9942                 .running = false,                                       \
9943                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
9944 @@ -119,7 +119,7 @@
9945   */
9946  #define __DEFINE_SRCU(name, is_static)                                 \
9947         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
9948 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
9949 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
9950  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
9951  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
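With the extra pcpu_name parameter, __SRCU_STRUCT_INIT() can be pointed at a caller-supplied per-cpu array while DEFINE_SRCU()/DEFINE_STATIC_SRCU() keep their old behaviour. A hedged expansion sketch plus the usual reader pattern; the domain name is illustrative.

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(demo_srcu);
/* roughly expands to:
 *   static DEFINE_PER_CPU(struct srcu_struct_array, demo_srcu_srcu_array);
 *   static struct srcu_struct demo_srcu =
 *           __SRCU_STRUCT_INIT(demo_srcu, demo_srcu_srcu_array);
 */

static void demo_srcu_reader(void)
{
        int idx = srcu_read_lock(&demo_srcu);
        /* ... dereference SRCU-protected data here ... */
        srcu_read_unlock(&demo_srcu, idx);
}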
9953 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/suspend.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/suspend.h
9954 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/suspend.h 2017-04-16 10:38:26.000000000 +0200
9955 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/suspend.h      2017-04-18 17:54:26.000000000 +0200
9956 @@ -193,6 +193,12 @@
9957         void (*end)(void);
9958  };
9960 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
9961 +extern bool pm_in_action;
9962 +#else
9963 +# define pm_in_action false
9964 +#endif
9966  #ifdef CONFIG_SUSPEND
9967  /**
9968   * suspend_set_ops - set platform dependent suspend operations
9969 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swait.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swait.h
9970 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swait.h   2017-04-16 10:38:26.000000000 +0200
9971 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swait.h        2017-04-18 17:54:26.000000000 +0200
9972 @@ -87,6 +87,7 @@
9973  extern void swake_up(struct swait_queue_head *q);
9974  extern void swake_up_all(struct swait_queue_head *q);
9975  extern void swake_up_locked(struct swait_queue_head *q);
9976 +extern void swake_up_all_locked(struct swait_queue_head *q);
9978  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
9979  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
9980 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swap.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swap.h
9981 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swap.h    2017-04-16 10:38:26.000000000 +0200
9982 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swap.h 2017-04-18 17:54:26.000000000 +0200
9983 @@ -11,6 +11,7 @@
9984  #include <linux/fs.h>
9985  #include <linux/atomic.h>
9986  #include <linux/page-flags.h>
9987 +#include <linux/locallock.h>
9988  #include <asm/page.h>
9990  struct notifier_block;
9991 @@ -247,7 +248,8 @@
9992  void *workingset_eviction(struct address_space *mapping, struct page *page);
9993  bool workingset_refault(void *shadow);
9994  void workingset_activation(struct page *page);
9995 -extern struct list_lru workingset_shadow_nodes;
9996 +extern struct list_lru __workingset_shadow_nodes;
9997 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
9999  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10001 @@ -292,6 +294,7 @@
10004  /* linux/mm/swap.c */
10005 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10006  extern void lru_cache_add(struct page *);
10007  extern void lru_cache_add_anon(struct page *page);
10008  extern void lru_cache_add_file(struct page *page);
10009 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swork.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swork.h
10010 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/swork.h   1970-01-01 01:00:00.000000000 +0100
10011 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/swork.h        2017-04-18 17:54:26.000000000 +0200
10012 @@ -0,0 +1,24 @@
10013 +#ifndef _LINUX_SWORK_H
10014 +#define _LINUX_SWORK_H
10016 +#include <linux/list.h>
10018 +struct swork_event {
10019 +       struct list_head item;
10020 +       unsigned long flags;
10021 +       void (*func)(struct swork_event *);
10024 +static inline void INIT_SWORK(struct swork_event *event,
10025 +                             void (*func)(struct swork_event *))
10027 +       event->flags = 0;
10028 +       event->func = func;
10031 +bool swork_queue(struct swork_event *sev);
10033 +int swork_get(void);
10034 +void swork_put(void);
10036 +#endif /* _LINUX_SWORK_H */
10037 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/thread_info.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/thread_info.h
10038 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/thread_info.h     2017-04-16 10:38:26.000000000 +0200
10039 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/thread_info.h  2017-04-18 17:54:26.000000000 +0200
10040 @@ -107,7 +107,17 @@
10041  #define test_thread_flag(flag) \
10042         test_ti_thread_flag(current_thread_info(), flag)
10044 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10045 +#ifdef CONFIG_PREEMPT_LAZY
10046 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10047 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10048 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10049 +#define tif_need_resched_lazy()        test_thread_flag(TIF_NEED_RESCHED_LAZY)
10051 +#else
10052 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10053 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10054 +#define tif_need_resched_lazy()        0
10055 +#endif
10057  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10058  static inline int arch_within_stack_frames(const void * const stack,
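With CONFIG_PREEMPT_LAZY, tif_need_resched() now reports either flag; a hedged sketch of a polling loop that honours it (names illustrative, not part of the patch).

#include <linux/thread_info.h>
#include <linux/sched.h>

static void demo_poll_until_resched(void)
{
        while (!tif_need_resched()) {           /* TIF_NEED_RESCHED or, with LAZY, TIF_NEED_RESCHED_LAZY */
                cpu_relax();
                /* ... poll hardware here ... */
        }
        cond_resched();                         /* give the scheduler a chance to run the woken task */
}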
10059 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/timer.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/timer.h
10060 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/timer.h   2017-04-16 10:38:26.000000000 +0200
10061 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/timer.h        2017-04-18 17:54:26.000000000 +0200
10062 @@ -241,7 +241,7 @@
10064  extern int try_to_del_timer_sync(struct timer_list *timer);
10066 -#ifdef CONFIG_SMP
10067 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10068    extern int del_timer_sync(struct timer_list *timer);
10069  #else
10070  # define del_timer_sync(t)             del_timer(t)
10071 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/trace_events.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/trace_events.h
10072 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/trace_events.h    2017-04-16 10:38:26.000000000 +0200
10073 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/trace_events.h 2017-04-18 17:54:26.000000000 +0200
10074 @@ -56,6 +56,9 @@
10075         unsigned char           flags;
10076         unsigned char           preempt_count;
10077         int                     pid;
10078 +       unsigned short          migrate_disable;
10079 +       unsigned short          padding;
10080 +       unsigned char           preempt_lazy_count;
10081  };
10083  #define TRACE_EVENT_TYPE_MAX                                           \
10084 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/uaccess.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/uaccess.h
10085 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/uaccess.h 2017-04-16 10:38:26.000000000 +0200
10086 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/uaccess.h      2017-04-18 17:54:26.000000000 +0200
10087 @@ -24,6 +24,7 @@
10088   */
10089  static inline void pagefault_disable(void)
10091 +       migrate_disable();
10092         pagefault_disabled_inc();
10093         /*
10094          * make sure to have issued the store before a pagefault
10095 @@ -40,6 +41,7 @@
10096          */
10097         barrier();
10098         pagefault_disabled_dec();
10099 +       migrate_enable();
10102  /*
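pagefault_disable() now also pins the task to its CPU via migrate_disable(); the usual atomic user-copy pattern is unchanged, sketched here with an illustrative function name.

#include <linux/uaccess.h>
#include <linux/errno.h>

static int demo_peek_user(const void __user *uaddr, unsigned long *val)
{
        unsigned long left;

        pagefault_disable();                    /* faults fail instead of sleeping; now also migrate_disable() */
        left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
        pagefault_enable();                     /* now also migrate_enable() */

        return left ? -EFAULT : 0;
}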
10103 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/uprobes.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/uprobes.h
10104 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/uprobes.h 2017-04-16 10:38:26.000000000 +0200
10105 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/uprobes.h      2017-04-18 17:54:26.000000000 +0200
10106 @@ -27,6 +27,7 @@
10107  #include <linux/errno.h>
10108  #include <linux/rbtree.h>
10109  #include <linux/types.h>
10110 +#include <linux/wait.h>
10112  struct vm_area_struct;
10113  struct mm_struct;
10114 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/vmstat.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/vmstat.h
10115 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/vmstat.h  2017-04-16 10:38:27.000000000 +0200
10116 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/vmstat.h       2017-04-18 17:54:26.000000000 +0200
10117 @@ -33,7 +33,9 @@
10118   */
10119  static inline void __count_vm_event(enum vm_event_item item)
10121 +       preempt_disable_rt();
10122         raw_cpu_inc(vm_event_states.event[item]);
10123 +       preempt_enable_rt();
10126  static inline void count_vm_event(enum vm_event_item item)
10127 @@ -43,7 +45,9 @@
10129  static inline void __count_vm_events(enum vm_event_item item, long delta)
10131 +       preempt_disable_rt();
10132         raw_cpu_add(vm_event_states.event[item], delta);
10133 +       preempt_enable_rt();
10136  static inline void count_vm_events(enum vm_event_item item, long delta)
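The count_vm_event*() helpers keep their signatures; on RT the __ variants above just gain a preempt_disable_rt() bracket around the raw per-cpu update. A trivial caller sketch; the event choice is illustrative only.

#include <linux/vmstat.h>

static void demo_account_free_pages(unsigned int order)
{
        count_vm_events(PGFREE, 1UL << order);  /* per-cpu add; RT-safe through the wrappers above */
}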
10137 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/wait.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/wait.h
10138 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/linux/wait.h    2017-04-16 10:38:27.000000000 +0200
10139 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/linux/wait.h 2017-04-18 17:54:26.000000000 +0200
10140 @@ -8,6 +8,7 @@
10141  #include <linux/spinlock.h>
10142  #include <asm/current.h>
10143  #include <uapi/linux/wait.h>
10144 +#include <linux/atomic.h>
10146  typedef struct __wait_queue wait_queue_t;
10147  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10148 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/dst.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/dst.h
10149 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/dst.h       2017-04-16 10:38:27.000000000 +0200
10150 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/dst.h    2017-04-18 17:54:26.000000000 +0200
10151 @@ -446,7 +446,7 @@
10152  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10153                                    struct sk_buff *skb)
10155 -       const struct hh_cache *hh;
10156 +       struct hh_cache *hh;
10158         if (dst->pending_confirm) {
10159                 unsigned long now = jiffies;
10160 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/gen_stats.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/gen_stats.h
10161 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/gen_stats.h 2017-04-16 10:38:27.000000000 +0200
10162 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/gen_stats.h      2017-04-18 17:54:26.000000000 +0200
10163 @@ -5,6 +5,7 @@
10164  #include <linux/socket.h>
10165  #include <linux/rtnetlink.h>
10166  #include <linux/pkt_sched.h>
10167 +#include <net/net_seq_lock.h>
10169  struct gnet_stats_basic_cpu {
10170         struct gnet_stats_basic_packed bstats;
10171 @@ -33,11 +34,11 @@
10172                                  spinlock_t *lock, struct gnet_dump *d,
10173                                  int padattr);
10175 -int gnet_stats_copy_basic(const seqcount_t *running,
10176 +int gnet_stats_copy_basic(net_seqlock_t *running,
10177                           struct gnet_dump *d,
10178                           struct gnet_stats_basic_cpu __percpu *cpu,
10179                           struct gnet_stats_basic_packed *b);
10180 -void __gnet_stats_copy_basic(const seqcount_t *running,
10181 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10182                              struct gnet_stats_basic_packed *bstats,
10183                              struct gnet_stats_basic_cpu __percpu *cpu,
10184                              struct gnet_stats_basic_packed *b);
10185 @@ -55,14 +56,14 @@
10186                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10187                       struct gnet_stats_rate_est64 *rate_est,
10188                       spinlock_t *stats_lock,
10189 -                     seqcount_t *running, struct nlattr *opt);
10190 +                     net_seqlock_t *running, struct nlattr *opt);
10191  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10192                         struct gnet_stats_rate_est64 *rate_est);
10193  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10194                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10195                           struct gnet_stats_rate_est64 *rate_est,
10196                           spinlock_t *stats_lock,
10197 -                         seqcount_t *running, struct nlattr *opt);
10198 +                         net_seqlock_t *running, struct nlattr *opt);
10199  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10200                           const struct gnet_stats_rate_est64 *rate_est);
10201  #endif
10202 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/neighbour.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/neighbour.h
10203 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/neighbour.h 2017-04-16 10:38:27.000000000 +0200
10204 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/neighbour.h      2017-04-18 17:54:26.000000000 +0200
10205 @@ -446,7 +446,7 @@
10207  #endif
10209 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10210 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10212         unsigned int seq;
10213         int hh_len;
10214 @@ -501,7 +501,7 @@
10216  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10218 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10219 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10220                                      const struct net_device *dev)
10222         unsigned int seq;
10223 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/net_seq_lock.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/net_seq_lock.h
10224 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/net_seq_lock.h      1970-01-01 01:00:00.000000000 +0100
10225 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/net_seq_lock.h   2017-04-18 17:54:26.000000000 +0200
10226 @@ -0,0 +1,15 @@
10227 +#ifndef __NET_NET_SEQ_LOCK_H__
10228 +#define __NET_NET_SEQ_LOCK_H__
10230 +#ifdef CONFIG_PREEMPT_RT_BASE
10231 +# define net_seqlock_t                 seqlock_t
10232 +# define net_seq_begin(__r)            read_seqbegin(__r)
10233 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10235 +#else
10236 +# define net_seqlock_t                 seqcount_t
10237 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10238 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10239 +#endif
10241 +#endif
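net_seqlock_t lets the qdisc "running" protection be a real seqlock on RT (so a preempted writer can be boosted) and stay a plain seqcount otherwise. A hedged reader sketch using the wrappers; the function name is illustrative.

#include <net/net_seq_lock.h>
#include <net/sch_generic.h>

static void demo_read_bstats(struct Qdisc *q, u64 *bytes, u32 *packets)
{
        net_seqlock_t *running = qdisc_root_sleeping_running(q);
        unsigned int seq;

        do {
                seq = net_seq_begin(running);
                *bytes = q->bstats.bytes;
                *packets = q->bstats.packets;
        } while (net_seq_retry(running, seq));
}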
10242 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/netns/ipv4.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/netns/ipv4.h
10243 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/netns/ipv4.h        2017-04-16 10:38:27.000000000 +0200
10244 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/netns/ipv4.h     2017-04-18 17:54:26.000000000 +0200
10245 @@ -69,6 +69,7 @@
10247         int sysctl_icmp_echo_ignore_all;
10248         int sysctl_icmp_echo_ignore_broadcasts;
10249 +       int sysctl_icmp_echo_sysrq;
10250         int sysctl_icmp_ignore_bogus_error_responses;
10251         int sysctl_icmp_ratelimit;
10252         int sysctl_icmp_ratemask;
10253 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/sch_generic.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/sch_generic.h
10254 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/net/sch_generic.h       2017-04-16 10:38:27.000000000 +0200
10255 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/net/sch_generic.h    2017-04-18 17:54:26.000000000 +0200
10256 @@ -10,6 +10,7 @@
10257  #include <linux/dynamic_queue_limits.h>
10258  #include <net/gen_stats.h>
10259  #include <net/rtnetlink.h>
10260 +#include <net/net_seq_lock.h>
10262  struct Qdisc_ops;
10263  struct qdisc_walker;
10264 @@ -86,7 +87,7 @@
10265         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10266         struct qdisc_skb_head   q;
10267         struct gnet_stats_basic_packed bstats;
10268 -       seqcount_t              running;
10269 +       net_seqlock_t           running;
10270         struct gnet_stats_queue qstats;
10271         unsigned long           state;
10272         struct Qdisc            *next_sched;
10273 @@ -98,13 +99,22 @@
10274         spinlock_t              busylock ____cacheline_aligned_in_smp;
10275  };
10277 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10278 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10280 +#ifdef CONFIG_PREEMPT_RT_BASE
10281 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10282 +#else
10283         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10284 +#endif
10287  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10289 +#ifdef CONFIG_PREEMPT_RT_BASE
10290 +       if (try_write_seqlock(&qdisc->running))
10291 +               return true;
10292 +       return false;
10293 +#else
10294         if (qdisc_is_running(qdisc))
10295                 return false;
10296         /* Variant of write_seqcount_begin() telling lockdep a trylock
10297 @@ -113,11 +123,16 @@
10298         raw_write_seqcount_begin(&qdisc->running);
10299         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10300         return true;
10301 +#endif
10304  static inline void qdisc_run_end(struct Qdisc *qdisc)
10306 +#ifdef CONFIG_PREEMPT_RT_BASE
10307 +       write_sequnlock(&qdisc->running);
10308 +#else
10309         write_seqcount_end(&qdisc->running);
10310 +#endif
10313  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10314 @@ -308,7 +323,7 @@
10315         return qdisc_lock(root);
10318 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10319 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10321         struct Qdisc *root = qdisc_root_sleeping(qdisc);
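qdisc_run_begin()/qdisc_run_end() keep their contract; on RT the begin side becomes a try_write_seqlock() on the running seqlock instead of the lockdep-annotated seqcount trick. Roughly how callers use the pair (a sketch, not the patch's own code).

#include <net/sch_generic.h>
#include <net/pkt_sched.h>

static void demo_run_qdisc(struct Qdisc *q)
{
        if (qdisc_run_begin(q)) {               /* false if another CPU already owns the qdisc */
                __qdisc_run(q);                 /* dequeue/transmit loop */
                qdisc_run_end(q);
        }
}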
10323 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/trace/events/hist.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/trace/events/hist.h
10324 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/trace/events/hist.h     1970-01-01 01:00:00.000000000 +0100
10325 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/trace/events/hist.h  2017-04-18 17:54:26.000000000 +0200
10326 @@ -0,0 +1,73 @@
10327 +#undef TRACE_SYSTEM
10328 +#define TRACE_SYSTEM hist
10330 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10331 +#define _TRACE_HIST_H
10333 +#include "latency_hist.h"
10334 +#include <linux/tracepoint.h>
10336 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10337 +#define trace_preemptirqsoff_hist(a, b)
10338 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10339 +#else
10340 +TRACE_EVENT(preemptirqsoff_hist,
10342 +       TP_PROTO(int reason, int starthist),
10344 +       TP_ARGS(reason, starthist),
10346 +       TP_STRUCT__entry(
10347 +               __field(int,    reason)
10348 +               __field(int,    starthist)
10349 +       ),
10351 +       TP_fast_assign(
10352 +               __entry->reason         = reason;
10353 +               __entry->starthist      = starthist;
10354 +       ),
10356 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10357 +                 __entry->starthist ? "start" : "stop")
10359 +#endif
10361 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10362 +#define trace_hrtimer_interrupt(a, b, c, d)
10363 +#else
10364 +TRACE_EVENT(hrtimer_interrupt,
10366 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10367 +               struct task_struct *task),
10369 +       TP_ARGS(cpu, offset, curr, task),
10371 +       TP_STRUCT__entry(
10372 +               __field(int,            cpu)
10373 +               __field(long long,      offset)
10374 +               __array(char,           ccomm,  TASK_COMM_LEN)
10375 +               __field(int,            cprio)
10376 +               __array(char,           tcomm,  TASK_COMM_LEN)
10377 +               __field(int,            tprio)
10378 +       ),
10380 +       TP_fast_assign(
10381 +               __entry->cpu    = cpu;
10382 +               __entry->offset = offset;
10383 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10384 +               __entry->cprio  = curr->prio;
10385 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10386 +                       task != NULL ? TASK_COMM_LEN : 7);
10387 +               __entry->tprio  = task != NULL ? task->prio : -1;
10388 +       ),
10390 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10391 +               __entry->cpu, __entry->offset, __entry->ccomm,
10392 +               __entry->cprio, __entry->tcomm, __entry->tprio)
10394 +#endif
10396 +#endif /* _TRACE_HIST_H */
10398 +/* This part must be outside protection */
10399 +#include <trace/define_trace.h>
10400 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/trace/events/latency_hist.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/trace/events/latency_hist.h
10401 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/include/trace/events/latency_hist.h     1970-01-01 01:00:00.000000000 +0100
10402 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/include/trace/events/latency_hist.h  2017-04-18 17:54:26.000000000 +0200
10403 @@ -0,0 +1,29 @@
10404 +#ifndef _LATENCY_HIST_H
10405 +#define _LATENCY_HIST_H
10407 +enum hist_action {
10408 +       IRQS_ON,
10409 +       PREEMPT_ON,
10410 +       TRACE_STOP,
10411 +       IRQS_OFF,
10412 +       PREEMPT_OFF,
10413 +       TRACE_START,
10416 +static char *actions[] = {
10417 +       "IRQS_ON",
10418 +       "PREEMPT_ON",
10419 +       "TRACE_STOP",
10420 +       "IRQS_OFF",
10421 +       "PREEMPT_OFF",
10422 +       "TRACE_START",
10425 +static inline char *getaction(int action)
10427 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10428 +               return actions[action];
10429 +       return "unknown";
10432 +#endif /* _LATENCY_HIST_H */
10433 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/Kconfig
10434 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/Kconfig    2017-04-16 10:38:29.000000000 +0200
10435 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/Kconfig 2017-04-18 17:54:26.000000000 +0200
10436 @@ -506,7 +506,7 @@
10438  config RCU_EXPERT
10439         bool "Make expert-level adjustments to RCU configuration"
10440 -       default n
10441 +       default y if PREEMPT_RT_FULL
10442         help
10443           This option needs to be enabled if you wish to make
10444           expert-level adjustments to RCU configuration.  By default,
10445 @@ -623,7 +623,7 @@
10447  config RCU_FAST_NO_HZ
10448         bool "Accelerate last non-dyntick-idle CPU's grace periods"
10449 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10450 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10451         default n
10452         help
10453           This option permits CPUs to enter dynticks-idle state even if
10454 @@ -650,7 +650,7 @@
10455  config RCU_BOOST
10456         bool "Enable RCU priority boosting"
10457         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10458 -       default n
10459 +       default y if PREEMPT_RT_FULL
10460         help
10461           This option boosts the priority of preempted RCU readers that
10462           block the current preemptible RCU grace period for too long.
10463 @@ -781,19 +781,6 @@
10465  endchoice
10467 -config RCU_EXPEDITE_BOOT
10468 -       bool
10469 -       default n
10470 -       help
10471 -         This option enables expedited grace periods at boot time,
10472 -         as if rcu_expedite_gp() had been invoked early in boot.
10473 -         The corresponding rcu_unexpedite_gp() is invoked from
10474 -         rcu_end_inkernel_boot(), which is intended to be invoked
10475 -         at the end of the kernel-only boot sequence, just before
10476 -         init is exec'ed.
10478 -         Accept the default if unsure.
10480  endmenu # "RCU Subsystem"
10482  config BUILD_BIN2C
10483 @@ -1064,6 +1051,7 @@
10484  config RT_GROUP_SCHED
10485         bool "Group scheduling for SCHED_RR/FIFO"
10486         depends on CGROUP_SCHED
10487 +       depends on !PREEMPT_RT_FULL
10488         default n
10489         help
10490           This feature lets you explicitly allocate real CPU bandwidth
10491 @@ -1772,6 +1760,7 @@
10493  config SLAB
10494         bool "SLAB"
10495 +       depends on !PREEMPT_RT_FULL
10496         select HAVE_HARDENED_USERCOPY_ALLOCATOR
10497         help
10498           The regular slab allocator that is established and known to work
10499 @@ -1792,6 +1781,7 @@
10500  config SLOB
10501         depends on EXPERT
10502         bool "SLOB (Simple Allocator)"
10503 +       depends on !PREEMPT_RT_FULL
10504         help
10505            SLOB replaces the stock allocator with a drastically simpler
10506            allocator. SLOB is generally more space efficient but
10507 @@ -1810,7 +1800,7 @@
10509  config SLUB_CPU_PARTIAL
10510         default y
10511 -       depends on SLUB && SMP
10512 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
10513         bool "SLUB per cpu partial cache"
10514         help
10515           Per cpu partial caches accellerate objects allocation and freeing
10516 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/Makefile linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/Makefile
10517 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/Makefile   2017-04-16 10:38:29.000000000 +0200
10518 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/Makefile        2017-04-18 17:54:26.000000000 +0200
10519 @@ -35,4 +35,4 @@
10520  include/generated/compile.h: FORCE
10521         @$($(quiet)chk_compile.h)
10522         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10523 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10524 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10525 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/main.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/main.c
10526 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/init/main.c     2017-04-16 10:38:29.000000000 +0200
10527 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/init/main.c  2017-04-18 17:54:26.000000000 +0200
10528 @@ -507,6 +507,7 @@
10529         setup_command_line(command_line);
10530         setup_nr_cpu_ids();
10531         setup_per_cpu_areas();
10532 +       softirq_early_init();
10533         boot_cpu_state_init();
10534         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10536 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/ipc/sem.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/ipc/sem.c
10537 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/ipc/sem.c       2017-04-16 10:38:29.000000000 +0200
10538 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/ipc/sem.c    2017-04-18 17:54:26.000000000 +0200
10539 @@ -712,6 +712,13 @@
10540  static void wake_up_sem_queue_prepare(struct list_head *pt,
10541                                 struct sem_queue *q, int error)
10543 +#ifdef CONFIG_PREEMPT_RT_BASE
10544 +       struct task_struct *p = q->sleeper;
10545 +       get_task_struct(p);
10546 +       q->status = error;
10547 +       wake_up_process(p);
10548 +       put_task_struct(p);
10549 +#else
10550         if (list_empty(pt)) {
10551                 /*
10552                  * Hold preempt off so that we don't get preempted and have the
10553 @@ -723,6 +730,7 @@
10554         q->pid = error;
10556         list_add_tail(&q->list, pt);
10557 +#endif
10560  /**
10561 @@ -736,6 +744,7 @@
10562   */
10563  static void wake_up_sem_queue_do(struct list_head *pt)
10565 +#ifndef CONFIG_PREEMPT_RT_BASE
10566         struct sem_queue *q, *t;
10567         int did_something;
10569 @@ -748,6 +757,7 @@
10570         }
10571         if (did_something)
10572                 preempt_enable();
10573 +#endif
10576  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10577 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/Kconfig.locks linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/Kconfig.locks
10578 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/Kconfig.locks    2017-04-16 10:38:29.000000000 +0200
10579 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/Kconfig.locks 2017-04-18 17:54:26.000000000 +0200
10580 @@ -225,11 +225,11 @@
10582  config MUTEX_SPIN_ON_OWNER
10583         def_bool y
10584 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
10585 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10587  config RWSEM_SPIN_ON_OWNER
10588         def_bool y
10589 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
10590 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10592  config LOCK_SPIN_ON_OWNER
10593         def_bool y
10594 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/Kconfig.preempt linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/Kconfig.preempt
10595 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/Kconfig.preempt  2017-04-16 10:38:29.000000000 +0200
10596 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/Kconfig.preempt       2017-04-18 17:54:26.000000000 +0200
10597 @@ -1,3 +1,16 @@
10598 +config PREEMPT
10599 +       bool
10600 +       select PREEMPT_COUNT
10602 +config PREEMPT_RT_BASE
10603 +       bool
10604 +       select PREEMPT
10606 +config HAVE_PREEMPT_LAZY
10607 +       bool
10609 +config PREEMPT_LAZY
10610 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
10612  choice
10613         prompt "Preemption Model"
10614 @@ -33,9 +46,9 @@
10616           Select this if you are building a kernel for a desktop system.
10618 -config PREEMPT
10619 +config PREEMPT__LL
10620         bool "Preemptible Kernel (Low-Latency Desktop)"
10621 -       select PREEMPT_COUNT
10622 +       select PREEMPT
10623         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
10624         help
10625           This option reduces the latency of the kernel by making
10626 @@ -52,6 +65,22 @@
10627           embedded system with latency requirements in the milliseconds
10628           range.
10630 +config PREEMPT_RTB
10631 +       bool "Preemptible Kernel (Basic RT)"
10632 +       select PREEMPT_RT_BASE
10633 +       help
10634 +         This option is basically the same as (Low-Latency Desktop) but
10635 +         enables changes which are preliminary steps toward the fully
10636 +         preemptible RT kernel.
10638 +config PREEMPT_RT_FULL
10639 +       bool "Fully Preemptible Kernel (RT)"
10640 +       depends on IRQ_FORCED_THREADING
10641 +       select PREEMPT_RT_BASE
10642 +       select PREEMPT_RCU
10643 +       help
10644 +         Selects all changes required for a fully preemptible (real-time) kernel.
10646  endchoice
10648  config PREEMPT_COUNT
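
With the restructured choice above, PREEMPT becomes a hidden symbol selected by both PREEMPT__LL and (via PREEMPT_RT_BASE) the two RT models, so C code can distinguish the preemption levels with the usual preprocessor checks. A minimal sketch of that pattern (the function name and messages are illustrative):

/* Illustrative only: compile-time dispatch on the new preemption symbols. */
#include <linux/kernel.h>

static void report_preempt_model(void)
{
#if defined(CONFIG_PREEMPT_RT_FULL)
	pr_info("preempt: fully preemptible (RT) kernel\n");
#elif defined(CONFIG_PREEMPT_RT_BASE)
	pr_info("preempt: basic RT kernel\n");
#elif defined(CONFIG_PREEMPT)
	pr_info("preempt: low-latency desktop\n");
#else
	pr_info("preempt: voluntary or none\n");
#endif
}
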
10649 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cgroup.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cgroup.c
10650 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cgroup.c 2017-04-16 10:38:29.000000000 +0200
10651 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cgroup.c      2017-04-18 17:54:26.000000000 +0200
10652 @@ -5040,10 +5040,10 @@
10653         queue_work(cgroup_destroy_wq, &css->destroy_work);
10656 -static void css_release_work_fn(struct work_struct *work)
10657 +static void css_release_work_fn(struct swork_event *sev)
10659         struct cgroup_subsys_state *css =
10660 -               container_of(work, struct cgroup_subsys_state, destroy_work);
10661 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
10662         struct cgroup_subsys *ss = css->ss;
10663         struct cgroup *cgrp = css->cgroup;
10665 @@ -5086,8 +5086,8 @@
10666         struct cgroup_subsys_state *css =
10667                 container_of(ref, struct cgroup_subsys_state, refcnt);
10669 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
10670 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
10671 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10672 +       swork_queue(&css->destroy_swork);
10675  static void init_and_link_css(struct cgroup_subsys_state *css,
10676 @@ -5739,6 +5739,7 @@
10677          */
10678         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10679         BUG_ON(!cgroup_destroy_wq);
10680 +       BUG_ON(swork_get());
10682         /*
10683          * Used to destroy pidlists and separate to serve as flush domain.
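
The cgroup hunks convert the css release path from a workqueue item to the simple-work (swork) facility that this patch adds elsewhere. A hedged sketch of the same conversion pattern on an illustrative object, using only the calls visible above (INIT_SWORK, swork_queue, swork_get); the <linux/swork.h> header location is an assumption about the -rt tree:

/* Illustrative only: deferring a release through the -rt simple work queue. */
#include <linux/swork.h>	/* assumed -rt header providing struct swork_event */
#include <linux/kernel.h>
#include <linux/init.h>

struct my_object {
	struct swork_event release_swork;
	/* ... payload ... */
};

static void my_release_fn(struct swork_event *sev)
{
	struct my_object *obj = container_of(sev, struct my_object, release_swork);

	/* tear down obj here; runs in the swork kthread, fully preemptible */
	(void)obj;
}

static void my_schedule_release(struct my_object *obj)
{
	INIT_SWORK(&obj->release_swork, my_release_fn);
	swork_queue(&obj->release_swork);
}

static int __init my_subsys_init(void)
{
	/* bring up the swork worker, mirroring BUG_ON(swork_get()) above */
	return swork_get();
}
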
10684 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cpu.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cpu.c
10685 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cpu.c    2017-04-16 10:38:29.000000000 +0200
10686 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cpu.c 2017-04-18 17:54:26.000000000 +0200
10687 @@ -239,6 +239,289 @@
10688  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
10689  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
10691 +/**
10692 + * hotplug_pcp - per cpu hotplug descriptor
10693 + * @unplug:    set when pin_current_cpu() needs to sync tasks
10694 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
10695 + * @refcount:  counter of tasks in pinned sections
10696 + * @grab_lock: set when the tasks entering pinned sections should wait
10697 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
10698 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
10699 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
10700 + *
10701 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
10702 + * is used as a flag and still exists after @sync_tsk has exited and
10703 + * @sync_tsk set to NULL.
10704 + */
10705 +struct hotplug_pcp {
10706 +       struct task_struct *unplug;
10707 +       struct task_struct *sync_tsk;
10708 +       int refcount;
10709 +       int grab_lock;
10710 +       struct completion synced;
10711 +       struct completion unplug_wait;
10712 +#ifdef CONFIG_PREEMPT_RT_FULL
10713 +       /*
10714 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
10715 +        * the task; otherwise the mutex will cause the task to fail
10716 +        * to sleep when required. (Because it's called from migrate_disable())
10717 +        *
10718 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
10719 +        * state.
10720 +        */
10721 +       spinlock_t lock;
10722 +#else
10723 +       struct mutex mutex;
10724 +#endif
10725 +       int mutex_init;
10728 +#ifdef CONFIG_PREEMPT_RT_FULL
10729 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
10730 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
10731 +#else
10732 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
10733 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
10734 +#endif
10736 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
10738 +/**
10739 + * pin_current_cpu - Prevent the current cpu from being unplugged
10740 + *
10741 + * Lightweight version of get_online_cpus() to prevent cpu from being
10742 + * unplugged when code runs in a migration disabled region.
10743 + *
10744 + * Must be called with preemption disabled (preempt_count = 1)!
10745 + */
10746 +void pin_current_cpu(void)
10748 +       struct hotplug_pcp *hp;
10749 +       int force = 0;
10751 +retry:
10752 +       hp = this_cpu_ptr(&hotplug_pcp);
10754 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
10755 +           hp->unplug == current) {
10756 +               hp->refcount++;
10757 +               return;
10758 +       }
10759 +       if (hp->grab_lock) {
10760 +               preempt_enable();
10761 +               hotplug_lock(hp);
10762 +               hotplug_unlock(hp);
10763 +       } else {
10764 +               preempt_enable();
10765 +               /*
10766 +                * Try to push this task off of this CPU.
10767 +                */
10768 +               if (!migrate_me()) {
10769 +                       preempt_disable();
10770 +                       hp = this_cpu_ptr(&hotplug_pcp);
10771 +                       if (!hp->grab_lock) {
10772 +                               /*
10773 +                                * Just let it continue; it's already pinned
10774 +                                * or about to sleep.
10775 +                                */
10776 +                               force = 1;
10777 +                               goto retry;
10778 +                       }
10779 +                       preempt_enable();
10780 +               }
10781 +       }
10782 +       preempt_disable();
10783 +       goto retry;
10786 +/**
10787 + * unpin_current_cpu - Allow unplug of current cpu
10788 + *
10789 + * Must be called with preemption or interrupts disabled!
10790 + */
10791 +void unpin_current_cpu(void)
10793 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
10795 +       WARN_ON(hp->refcount <= 0);
10797 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
10798 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
10799 +               wake_up_process(hp->unplug);
10802 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
10804 +       set_current_state(TASK_UNINTERRUPTIBLE);
10805 +       while (hp->refcount) {
10806 +               schedule_preempt_disabled();
10807 +               set_current_state(TASK_UNINTERRUPTIBLE);
10808 +       }
10811 +static int sync_unplug_thread(void *data)
10813 +       struct hotplug_pcp *hp = data;
10815 +       wait_for_completion(&hp->unplug_wait);
10816 +       preempt_disable();
10817 +       hp->unplug = current;
10818 +       wait_for_pinned_cpus(hp);
10820 +       /*
10821 +        * This thread will synchronize the cpu_down() with threads
10822 +        * that have pinned the CPU. When the pinned CPU count reaches
10823 +        * zero, we inform the cpu_down code to continue to the next step.
10824 +        */
10825 +       set_current_state(TASK_UNINTERRUPTIBLE);
10826 +       preempt_enable();
10827 +       complete(&hp->synced);
10829 +       /*
10830 +        * If all succeeds, the next step will need tasks to wait till
10831 +        * the CPU is offline before continuing. To do this, the grab_lock
10832 +        * is set and tasks going into pin_current_cpu() will block on the
10833 +        * mutex. But we still need to wait for those that are already in
10834 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
10835 +        * will kick this thread out.
10836 +        */
10837 +       while (!hp->grab_lock && !kthread_should_stop()) {
10838 +               schedule();
10839 +               set_current_state(TASK_UNINTERRUPTIBLE);
10840 +       }
10842 +       /* Make sure grab_lock is seen before we see a stale completion */
10843 +       smp_mb();
10845 +       /*
10846 +        * Now just before cpu_down() enters stop machine, we need to make
10847 +        * sure all tasks that are in pinned CPU sections are out, and new
10848 +        * tasks will now grab the lock, keeping them from entering pinned
10849 +        * CPU sections.
10850 +        */
10851 +       if (!kthread_should_stop()) {
10852 +               preempt_disable();
10853 +               wait_for_pinned_cpus(hp);
10854 +               preempt_enable();
10855 +               complete(&hp->synced);
10856 +       }
10858 +       set_current_state(TASK_UNINTERRUPTIBLE);
10859 +       while (!kthread_should_stop()) {
10860 +               schedule();
10861 +               set_current_state(TASK_UNINTERRUPTIBLE);
10862 +       }
10863 +       set_current_state(TASK_RUNNING);
10865 +       /*
10866 +        * Force this thread off this CPU as it's going down and
10867 +        * we don't want any more work on this CPU.
10868 +        */
10869 +       current->flags &= ~PF_NO_SETAFFINITY;
10870 +       set_cpus_allowed_ptr(current, cpu_present_mask);
10871 +       migrate_me();
10872 +       return 0;
10875 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
10877 +       wake_up_process(hp->sync_tsk);
10878 +       wait_for_completion(&hp->synced);
10881 +static void __cpu_unplug_wait(unsigned int cpu)
10883 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10885 +       complete(&hp->unplug_wait);
10886 +       wait_for_completion(&hp->synced);
10890 + * Start the sync_unplug_thread on the target cpu and wait for it to
10891 + * complete.
10892 + */
10893 +static int cpu_unplug_begin(unsigned int cpu)
10895 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10896 +       int err;
10898 +       /* Protected by cpu_hotplug.lock */
10899 +       if (!hp->mutex_init) {
10900 +#ifdef CONFIG_PREEMPT_RT_FULL
10901 +               spin_lock_init(&hp->lock);
10902 +#else
10903 +               mutex_init(&hp->mutex);
10904 +#endif
10905 +               hp->mutex_init = 1;
10906 +       }
10908 +       /* Inform the scheduler to migrate tasks off this CPU */
10909 +       tell_sched_cpu_down_begin(cpu);
10911 +       init_completion(&hp->synced);
10912 +       init_completion(&hp->unplug_wait);
10914 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
10915 +       if (IS_ERR(hp->sync_tsk)) {
10916 +               err = PTR_ERR(hp->sync_tsk);
10917 +               hp->sync_tsk = NULL;
10918 +               return err;
10919 +       }
10920 +       kthread_bind(hp->sync_tsk, cpu);
10922 +       /*
10923 +        * Wait for tasks to get out of the pinned sections,
10924 +        * it's still OK if new tasks enter. Some CPU notifiers will
10925 +        * wait for tasks that are going to enter these sections and
10926 +        * we must not have them block.
10927 +        */
10928 +       wake_up_process(hp->sync_tsk);
10929 +       return 0;
10932 +static void cpu_unplug_sync(unsigned int cpu)
10934 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10936 +       init_completion(&hp->synced);
10937 +       /* The completion needs to be initialized before setting grab_lock */
10938 +       smp_wmb();
10940 +       /* Grab the mutex before setting grab_lock */
10941 +       hotplug_lock(hp);
10942 +       hp->grab_lock = 1;
10944 +       /*
10945 +        * The CPU notifiers have been completed.
10946 +        * Wait for tasks to get out of pinned CPU sections and have new
10947 +        * tasks block until the CPU is completely down.
10948 +        */
10949 +       __cpu_unplug_sync(hp);
10951 +       /* All done with the sync thread */
10952 +       kthread_stop(hp->sync_tsk);
10953 +       hp->sync_tsk = NULL;
10956 +static void cpu_unplug_done(unsigned int cpu)
10958 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10960 +       hp->unplug = NULL;
10961 +       /* Let all tasks know cpu unplug is finished before cleaning up */
10962 +       smp_wmb();
10964 +       if (hp->sync_tsk)
10965 +               kthread_stop(hp->sync_tsk);
10967 +       if (hp->grab_lock) {
10968 +               hotplug_unlock(hp);
10969 +               /* protected by cpu_hotplug.lock */
10970 +               hp->grab_lock = 0;
10971 +       }
10972 +       tell_sched_cpu_down_done(cpu);
10975  void get_online_cpus(void)
10977 @@ -789,10 +1072,14 @@
10978         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10979         int err;
10981 +       __cpu_unplug_wait(cpu);
10982         /* Park the smpboot threads */
10983         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
10984         smpboot_park_threads(cpu);
10986 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
10987 +       cpu_unplug_sync(cpu);
10989         /*
10990          * Prevent irq alloc/free while the dying cpu reorganizes the
10991          * interrupt affinities.
10992 @@ -877,6 +1164,9 @@
10993         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10994         int prev_state, ret = 0;
10995         bool hasdied = false;
10996 +       int mycpu;
10997 +       cpumask_var_t cpumask;
10998 +       cpumask_var_t cpumask_org;
11000         if (num_online_cpus() == 1)
11001                 return -EBUSY;
11002 @@ -884,7 +1174,34 @@
11003         if (!cpu_present(cpu))
11004                 return -EINVAL;
11006 +       /* Move the downtaker off the unplug cpu */
11007 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11008 +               return -ENOMEM;
11009 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11010 +               free_cpumask_var(cpumask);
11011 +               return -ENOMEM;
11012 +       }
11014 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11015 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11016 +       set_cpus_allowed_ptr(current, cpumask);
11017 +       free_cpumask_var(cpumask);
11018 +       migrate_disable();
11019 +       mycpu = smp_processor_id();
11020 +       if (mycpu == cpu) {
11021 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11022 +               migrate_enable();
11023 +               ret = -EBUSY;
11024 +               goto restore_cpus;
11025 +       }
11027 +       migrate_enable();
11028         cpu_hotplug_begin();
11029 +       ret = cpu_unplug_begin(cpu);
11030 +       if (ret) {
11031 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11032 +               goto out_cancel;
11033 +       }
11035         cpuhp_tasks_frozen = tasks_frozen;
11037 @@ -923,10 +1240,15 @@
11039         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11040  out:
11041 +       cpu_unplug_done(cpu);
11042 +out_cancel:
11043         cpu_hotplug_done();
11044         /* This post dead nonsense must die */
11045         if (!ret && hasdied)
11046                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11047 +restore_cpus:
11048 +       set_cpus_allowed_ptr(current, cpumask_org);
11049 +       free_cpumask_var(cpumask_org);
11050         return ret;
11053 @@ -1240,6 +1562,8 @@
11055  #endif /* CONFIG_PM_SLEEP_SMP */
11057 +int __boot_cpu_id;
11059  #endif /* CONFIG_SMP */
11061  /* Boot processor state steps */
11062 @@ -1923,6 +2247,10 @@
11063         set_cpu_active(cpu, true);
11064         set_cpu_present(cpu, true);
11065         set_cpu_possible(cpu, true);
11067 +#ifdef CONFIG_SMP
11068 +       __boot_cpu_id = cpu;
11069 +#endif
11072  /*
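
pin_current_cpu()/unpin_current_cpu() above exist so that migration-disabled regions can hold off CPU unplug: the kerneldoc notes they are driven from migrate_disable(), and the hotplug path added to _cpu_down() first pushes the caller off the dying CPU and then synchronizes with pinned sections through sync_unplug_thread(). A minimal usage sketch from the caller's point of view, assuming migrate_disable()/migrate_enable() are wired to the pin/unpin helpers as the comments above describe:

/* Illustrative only: a preemptible section that must stay on one CPU and
 * therefore holds off unplug of that CPU via the hotplug_pcp machinery. */
#include <linux/preempt.h>

static void touch_cpu_local_state(void)
{
	migrate_disable();	/* pins the current CPU (pin_current_cpu()) */

	/* ... preemptible work that must not change CPUs while it runs ... */

	migrate_enable();	/* drops the pin (unpin_current_cpu()) */
}
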
11073 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cpuset.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cpuset.c
11074 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/cpuset.c 2017-04-16 10:38:29.000000000 +0200
11075 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/cpuset.c      2017-04-18 17:54:26.000000000 +0200
11076 @@ -284,7 +284,7 @@
11077   */
11079  static DEFINE_MUTEX(cpuset_mutex);
11080 -static DEFINE_SPINLOCK(callback_lock);
11081 +static DEFINE_RAW_SPINLOCK(callback_lock);
11083  static struct workqueue_struct *cpuset_migrate_mm_wq;
11085 @@ -907,9 +907,9 @@
11086                         continue;
11087                 rcu_read_unlock();
11089 -               spin_lock_irq(&callback_lock);
11090 +               raw_spin_lock_irq(&callback_lock);
11091                 cpumask_copy(cp->effective_cpus, new_cpus);
11092 -               spin_unlock_irq(&callback_lock);
11093 +               raw_spin_unlock_irq(&callback_lock);
11095                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11096                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11097 @@ -974,9 +974,9 @@
11098         if (retval < 0)
11099                 return retval;
11101 -       spin_lock_irq(&callback_lock);
11102 +       raw_spin_lock_irq(&callback_lock);
11103         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11104 -       spin_unlock_irq(&callback_lock);
11105 +       raw_spin_unlock_irq(&callback_lock);
11107         /* use trialcs->cpus_allowed as a temp variable */
11108         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11109 @@ -1176,9 +1176,9 @@
11110                         continue;
11111                 rcu_read_unlock();
11113 -               spin_lock_irq(&callback_lock);
11114 +               raw_spin_lock_irq(&callback_lock);
11115                 cp->effective_mems = *new_mems;
11116 -               spin_unlock_irq(&callback_lock);
11117 +               raw_spin_unlock_irq(&callback_lock);
11119                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11120                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11121 @@ -1246,9 +1246,9 @@
11122         if (retval < 0)
11123                 goto done;
11125 -       spin_lock_irq(&callback_lock);
11126 +       raw_spin_lock_irq(&callback_lock);
11127         cs->mems_allowed = trialcs->mems_allowed;
11128 -       spin_unlock_irq(&callback_lock);
11129 +       raw_spin_unlock_irq(&callback_lock);
11131         /* use trialcs->mems_allowed as a temp variable */
11132         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11133 @@ -1339,9 +1339,9 @@
11134         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11135                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11137 -       spin_lock_irq(&callback_lock);
11138 +       raw_spin_lock_irq(&callback_lock);
11139         cs->flags = trialcs->flags;
11140 -       spin_unlock_irq(&callback_lock);
11141 +       raw_spin_unlock_irq(&callback_lock);
11143         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11144                 rebuild_sched_domains_locked();
11145 @@ -1756,7 +1756,7 @@
11146         cpuset_filetype_t type = seq_cft(sf)->private;
11147         int ret = 0;
11149 -       spin_lock_irq(&callback_lock);
11150 +       raw_spin_lock_irq(&callback_lock);
11152         switch (type) {
11153         case FILE_CPULIST:
11154 @@ -1775,7 +1775,7 @@
11155                 ret = -EINVAL;
11156         }
11158 -       spin_unlock_irq(&callback_lock);
11159 +       raw_spin_unlock_irq(&callback_lock);
11160         return ret;
11163 @@ -1989,12 +1989,12 @@
11165         cpuset_inc();
11167 -       spin_lock_irq(&callback_lock);
11168 +       raw_spin_lock_irq(&callback_lock);
11169         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11170                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
11171                 cs->effective_mems = parent->effective_mems;
11172         }
11173 -       spin_unlock_irq(&callback_lock);
11174 +       raw_spin_unlock_irq(&callback_lock);
11176         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
11177                 goto out_unlock;
11178 @@ -2021,12 +2021,12 @@
11179         }
11180         rcu_read_unlock();
11182 -       spin_lock_irq(&callback_lock);
11183 +       raw_spin_lock_irq(&callback_lock);
11184         cs->mems_allowed = parent->mems_allowed;
11185         cs->effective_mems = parent->mems_allowed;
11186         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
11187         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
11188 -       spin_unlock_irq(&callback_lock);
11189 +       raw_spin_unlock_irq(&callback_lock);
11190  out_unlock:
11191         mutex_unlock(&cpuset_mutex);
11192         return 0;
11193 @@ -2065,7 +2065,7 @@
11194  static void cpuset_bind(struct cgroup_subsys_state *root_css)
11196         mutex_lock(&cpuset_mutex);
11197 -       spin_lock_irq(&callback_lock);
11198 +       raw_spin_lock_irq(&callback_lock);
11200         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11201                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
11202 @@ -2076,7 +2076,7 @@
11203                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
11204         }
11206 -       spin_unlock_irq(&callback_lock);
11207 +       raw_spin_unlock_irq(&callback_lock);
11208         mutex_unlock(&cpuset_mutex);
11211 @@ -2177,12 +2177,12 @@
11213         bool is_empty;
11215 -       spin_lock_irq(&callback_lock);
11216 +       raw_spin_lock_irq(&callback_lock);
11217         cpumask_copy(cs->cpus_allowed, new_cpus);
11218         cpumask_copy(cs->effective_cpus, new_cpus);
11219         cs->mems_allowed = *new_mems;
11220         cs->effective_mems = *new_mems;
11221 -       spin_unlock_irq(&callback_lock);
11222 +       raw_spin_unlock_irq(&callback_lock);
11224         /*
11225          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
11226 @@ -2219,10 +2219,10 @@
11227         if (nodes_empty(*new_mems))
11228                 *new_mems = parent_cs(cs)->effective_mems;
11230 -       spin_lock_irq(&callback_lock);
11231 +       raw_spin_lock_irq(&callback_lock);
11232         cpumask_copy(cs->effective_cpus, new_cpus);
11233         cs->effective_mems = *new_mems;
11234 -       spin_unlock_irq(&callback_lock);
11235 +       raw_spin_unlock_irq(&callback_lock);
11237         if (cpus_updated)
11238                 update_tasks_cpumask(cs);
11239 @@ -2308,21 +2308,21 @@
11241         /* synchronize cpus_allowed to cpu_active_mask */
11242         if (cpus_updated) {
11243 -               spin_lock_irq(&callback_lock);
11244 +               raw_spin_lock_irq(&callback_lock);
11245                 if (!on_dfl)
11246                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
11247                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
11248 -               spin_unlock_irq(&callback_lock);
11249 +               raw_spin_unlock_irq(&callback_lock);
11250                 /* we don't mess with cpumasks of tasks in top_cpuset */
11251         }
11253         /* synchronize mems_allowed to N_MEMORY */
11254         if (mems_updated) {
11255 -               spin_lock_irq(&callback_lock);
11256 +               raw_spin_lock_irq(&callback_lock);
11257                 if (!on_dfl)
11258                         top_cpuset.mems_allowed = new_mems;
11259                 top_cpuset.effective_mems = new_mems;
11260 -               spin_unlock_irq(&callback_lock);
11261 +               raw_spin_unlock_irq(&callback_lock);
11262                 update_tasks_nodemask(&top_cpuset);
11263         }
11265 @@ -2420,11 +2420,11 @@
11267         unsigned long flags;
11269 -       spin_lock_irqsave(&callback_lock, flags);
11270 +       raw_spin_lock_irqsave(&callback_lock, flags);
11271         rcu_read_lock();
11272         guarantee_online_cpus(task_cs(tsk), pmask);
11273         rcu_read_unlock();
11274 -       spin_unlock_irqrestore(&callback_lock, flags);
11275 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11278  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
11279 @@ -2472,11 +2472,11 @@
11280         nodemask_t mask;
11281         unsigned long flags;
11283 -       spin_lock_irqsave(&callback_lock, flags);
11284 +       raw_spin_lock_irqsave(&callback_lock, flags);
11285         rcu_read_lock();
11286         guarantee_online_mems(task_cs(tsk), &mask);
11287         rcu_read_unlock();
11288 -       spin_unlock_irqrestore(&callback_lock, flags);
11289 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11291         return mask;
11293 @@ -2568,14 +2568,14 @@
11294                 return true;
11296         /* Not hardwall and node outside mems_allowed: scan up cpusets */
11297 -       spin_lock_irqsave(&callback_lock, flags);
11298 +       raw_spin_lock_irqsave(&callback_lock, flags);
11300         rcu_read_lock();
11301         cs = nearest_hardwall_ancestor(task_cs(current));
11302         allowed = node_isset(node, cs->mems_allowed);
11303         rcu_read_unlock();
11305 -       spin_unlock_irqrestore(&callback_lock, flags);
11306 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11307         return allowed;
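
On PREEMPT_RT_FULL a spinlock_t is a sleeping lock, so callback_lock, which is taken from contexts that must not sleep, is converted to a raw_spinlock_t above; a raw lock keeps spinning with interrupts disabled even on -rt. A minimal sketch of the raw variant (lock and data names are illustrative):

/* Illustrative only: a raw spinlock stays a true spinning lock on -rt. */
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static unsigned long example_events;

static void example_account(unsigned long n)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	example_events += n;	/* keep the critical section short and non-sleeping */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}
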
11310 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/debug/kdb/kdb_io.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/debug/kdb/kdb_io.c
11311 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/debug/kdb/kdb_io.c       2017-04-16 10:38:29.000000000 +0200
11312 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/debug/kdb/kdb_io.c    2017-04-18 17:54:26.000000000 +0200
11313 @@ -554,7 +554,6 @@
11314         int linecount;
11315         int colcount;
11316         int logging, saved_loglevel = 0;
11317 -       int saved_trap_printk;
11318         int got_printf_lock = 0;
11319         int retlen = 0;
11320         int fnd, len;
11321 @@ -565,8 +564,6 @@
11322         unsigned long uninitialized_var(flags);
11324         preempt_disable();
11325 -       saved_trap_printk = kdb_trap_printk;
11326 -       kdb_trap_printk = 0;
11328         /* Serialize kdb_printf if multiple cpus try to write at once.
11329          * But if any cpu goes recursive in kdb, just print the output,
11330 @@ -855,7 +852,6 @@
11331         } else {
11332                 __release(kdb_printf_lock);
11333         }
11334 -       kdb_trap_printk = saved_trap_printk;
11335         preempt_enable();
11336         return retlen;
11338 @@ -865,9 +861,11 @@
11339         va_list ap;
11340         int r;
11342 +       kdb_trap_printk++;
11343         va_start(ap, fmt);
11344         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
11345         va_end(ap);
11346 +       kdb_trap_printk--;
11348         return r;
11350 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/events/core.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/events/core.c
11351 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/events/core.c    2017-04-16 10:38:29.000000000 +0200
11352 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/events/core.c 2017-04-18 17:54:26.000000000 +0200
11353 @@ -1050,6 +1050,7 @@
11354         raw_spin_lock_init(&cpuctx->hrtimer_lock);
11355         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
11356         timer->function = perf_mux_hrtimer_handler;
11357 +       timer->irqsafe = 1;
11360  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
11361 @@ -8363,6 +8364,7 @@
11363         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11364         hwc->hrtimer.function = perf_swevent_hrtimer;
11365 +       hwc->hrtimer.irqsafe = 1;
11367         /*
11368          * Since hrtimers have a fixed rate, we can do a static freq->period
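
The irqsafe flag set on the perf hrtimers above is an -rt-only field: ordinary hrtimers are deferred to softirq context on PREEMPT_RT_FULL, while irqsafe timers keep firing from hard interrupt context, which the perf callbacks require. A minimal sketch of initializing such a timer (names are illustrative; the irqsafe member exists only with this patch applied):

/* Illustrative only: an hrtimer whose callback may run in hard irq context. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *t)
{
	/* no sleeping locks here: on -rt this may run with interrupts disabled */
	return HRTIMER_NORESTART;
}

static void example_timer_start(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_timer_fn;
	example_timer.irqsafe = 1;	/* -rt only field, as in the hunks above */
	hrtimer_start(&example_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}
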
11369 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/exit.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/exit.c
11370 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/exit.c   2017-04-16 10:38:29.000000000 +0200
11371 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/exit.c        2017-04-18 17:54:26.000000000 +0200
11372 @@ -143,7 +143,7 @@
11373          * Do this under ->siglock, we can race with another thread
11374          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
11375          */
11376 -       flush_sigqueue(&tsk->pending);
11377 +       flush_task_sigqueue(tsk);
11378         tsk->sighand = NULL;
11379         spin_unlock(&sighand->siglock);
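
flush_task_sigqueue() replaces the plain flush_sigqueue(&tsk->pending) call because, with this patch, each task also carries a sigqueue_cache (initialized in the fork.c hunk below) that must be returned to the slab on exit. A hedged sketch of what the helper does; the real implementation lives in kernel/signal.c and sigqueue_cachep is that file's slab cache, so this is an assumption-laden outline rather than the actual code:

/* Illustrative only: sketch of flush_task_sigqueue(); assumes sigqueue_cachep
 * is the sigqueue slab cache defined in kernel/signal.c (not exported). */
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/slab.h>

extern struct kmem_cache *sigqueue_cachep;	/* assumption for the sketch */

static void flush_task_sigqueue_sketch(struct task_struct *tsk)
{
	struct sigqueue *q = tsk->sigqueue_cache;

	flush_sigqueue(&tsk->pending);
	if (q)
		kmem_cache_free(sigqueue_cachep, q);
}
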
11381 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/fork.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/fork.c
11382 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/fork.c   2017-04-16 10:38:29.000000000 +0200
11383 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/fork.c        2017-04-18 17:54:26.000000000 +0200
11384 @@ -76,6 +76,7 @@
11385  #include <linux/compiler.h>
11386  #include <linux/sysctl.h>
11387  #include <linux/kcov.h>
11388 +#include <linux/kprobes.h>
11390  #include <asm/pgtable.h>
11391  #include <asm/pgalloc.h>
11392 @@ -376,13 +377,24 @@
11393         if (atomic_dec_and_test(&sig->sigcnt))
11394                 free_signal_struct(sig);
11397 +#ifdef CONFIG_PREEMPT_RT_BASE
11398 +static
11399 +#endif
11400  void __put_task_struct(struct task_struct *tsk)
11402         WARN_ON(!tsk->exit_state);
11403         WARN_ON(atomic_read(&tsk->usage));
11404         WARN_ON(tsk == current);
11406 +       /*
11407 +        * Remove function-return probe instances associated with this
11408 +        * task and put them back on the free list.
11409 +        */
11410 +       kprobe_flush_task(tsk);
11412 +       /* Task is done with its stack. */
11413 +       put_task_stack(tsk);
11415         cgroup_free(tsk);
11416         task_numa_free(tsk);
11417         security_task_free(tsk);
11418 @@ -393,7 +405,18 @@
11419         if (!profile_handoff_task(tsk))
11420                 free_task(tsk);
11422 +#ifndef CONFIG_PREEMPT_RT_BASE
11423  EXPORT_SYMBOL_GPL(__put_task_struct);
11424 +#else
11425 +void __put_task_struct_cb(struct rcu_head *rhp)
11427 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
11429 +       __put_task_struct(tsk);
11432 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
11433 +#endif
11435  void __init __weak arch_task_cache_init(void) { }
11437 @@ -852,6 +875,19 @@
11439  EXPORT_SYMBOL_GPL(__mmdrop);
11441 +#ifdef CONFIG_PREEMPT_RT_BASE
11443 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
11444 + * want another facility to make this work.
11445 + */
11446 +void __mmdrop_delayed(struct rcu_head *rhp)
11448 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
11450 +       __mmdrop(mm);
11452 +#endif
11454  static inline void __mmput(struct mm_struct *mm)
11456         VM_BUG_ON(atomic_read(&mm->mm_users));
11457 @@ -1426,6 +1462,9 @@
11458   */
11459  static void posix_cpu_timers_init(struct task_struct *tsk)
11461 +#ifdef CONFIG_PREEMPT_RT_BASE
11462 +       tsk->posix_timer_list = NULL;
11463 +#endif
11464         tsk->cputime_expires.prof_exp = 0;
11465         tsk->cputime_expires.virt_exp = 0;
11466         tsk->cputime_expires.sched_exp = 0;
11467 @@ -1552,6 +1591,7 @@
11468         spin_lock_init(&p->alloc_lock);
11470         init_sigpending(&p->pending);
11471 +       p->sigqueue_cache = NULL;
11473         p->utime = p->stime = p->gtime = 0;
11474         p->utimescaled = p->stimescaled = 0;
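
__mmdrop_delayed() above frees an mm_struct from an RCU callback so the final mmdrop can be deferred out of atomic context on -rt; the delayed_drop rcu_head it keys off is added to struct mm_struct elsewhere in this patch. A hedged sketch of the wrapper that would drive it (the mmdrop_delayed_sketch() name and its exact home are assumptions; in the -rt tree the real wrapper sits next to mmdrop() in a header):

/* Illustrative only: deferring the final mm drop through RCU on -rt. */
#include <linux/sched.h>
#include <linux/rcupdate.h>

/* __mmdrop_delayed() is the RCU callback added in the fork.c hunk above. */
static inline void mmdrop_delayed_sketch(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
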
11475 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/futex.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/futex.c
11476 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/futex.c  2017-04-16 10:38:29.000000000 +0200
11477 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/futex.c       2017-04-18 17:54:26.000000000 +0200
11478 @@ -800,7 +800,7 @@
11479         return 0;
11482 -static struct futex_pi_state * alloc_pi_state(void)
11483 +static struct futex_pi_state *alloc_pi_state(void)
11485         struct futex_pi_state *pi_state = current->pi_state_cache;
11487 @@ -810,6 +810,11 @@
11488         return pi_state;
11491 +static void get_pi_state(struct futex_pi_state *pi_state)
11493 +       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
11496  /*
11497   * Drops a reference to the pi_state object and frees or caches it
11498   * when the last reference is gone.
11499 @@ -854,7 +859,7 @@
11500   * Look up the task based on what TID userspace gave us.
11501   * We don't trust it.
11502   */
11503 -static struct task_struct * futex_find_get_task(pid_t pid)
11504 +static struct task_struct *futex_find_get_task(pid_t pid)
11506         struct task_struct *p;
11508 @@ -904,7 +909,9 @@
11509                  * task still owns the PI-state:
11510                  */
11511                 if (head->next != next) {
11512 +                       raw_spin_unlock_irq(&curr->pi_lock);
11513                         spin_unlock(&hb->lock);
11514 +                       raw_spin_lock_irq(&curr->pi_lock);
11515                         continue;
11516                 }
11518 @@ -914,10 +921,12 @@
11519                 pi_state->owner = NULL;
11520                 raw_spin_unlock_irq(&curr->pi_lock);
11522 -               rt_mutex_unlock(&pi_state->pi_mutex);
11524 +               get_pi_state(pi_state);
11525                 spin_unlock(&hb->lock);
11527 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
11528 +               put_pi_state(pi_state);
11530                 raw_spin_lock_irq(&curr->pi_lock);
11531         }
11532         raw_spin_unlock_irq(&curr->pi_lock);
11533 @@ -971,6 +980,39 @@
11534   *
11535   * [10] There is no transient state which leaves owner and user space
11536   *     TID out of sync.
11537 + *
11538 + *
11539 + * Serialization and lifetime rules:
11540 + *
11541 + * hb->lock:
11542 + *
11543 + *     hb -> futex_q, relation
11544 + *     futex_q -> pi_state, relation
11545 + *
11546 + *     (cannot be raw because hb can contain an arbitrary number
11547 + *      of futex_q's)
11548 + *
11549 + * pi_mutex->wait_lock:
11550 + *
11551 + *     {uval, pi_state}
11552 + *
11553 + *     (and pi_mutex 'obviously')
11554 + *
11555 + * p->pi_lock:
11556 + *
11557 + *     p->pi_state_list -> pi_state->list, relation
11558 + *
11559 + * pi_state->refcount:
11560 + *
11561 + *     pi_state lifetime
11562 + *
11563 + *
11564 + * Lock order:
11565 + *
11566 + *   hb->lock
11567 + *     pi_mutex->wait_lock
11568 + *       p->pi_lock
11569 + *
11570   */
11572  /*
11573 @@ -978,10 +1020,12 @@
11574   * the pi_state against the user space value. If correct, attach to
11575   * it.
11576   */
11577 -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
11578 +static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
11579 +                             struct futex_pi_state *pi_state,
11580                               struct futex_pi_state **ps)
11582         pid_t pid = uval & FUTEX_TID_MASK;
11583 +       int ret, uval2;
11585         /*
11586          * Userspace might have messed up non-PI and PI futexes [3]
11587 @@ -989,9 +1033,39 @@
11588         if (unlikely(!pi_state))
11589                 return -EINVAL;
11591 +       /*
11592 +        * We get here with hb->lock held, and having found a
11593 +        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
11594 +        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
11595 +        * which in turn means that futex_lock_pi() still has a reference on
11596 +        * our pi_state.
11597 +        *
11598 +        * The waiter holding a reference on @pi_state also protects against
11599 +        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
11600 +        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
11601 +        * free pi_state before we can take a reference ourselves.
11602 +        */
11603         WARN_ON(!atomic_read(&pi_state->refcount));
11605         /*
11606 +        * Now that we have a pi_state, we can acquire wait_lock
11607 +        * and do the state validation.
11608 +        */
11609 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
11611 +       /*
11612 +        * Since {uval, pi_state} is serialized by wait_lock, and our current
11613 +        * uval was read without holding it, it can have changed. Verify it
11614 +        * still is what we expect it to be, otherwise retry the entire
11615 +        * operation.
11616 +        */
11617 +       if (get_futex_value_locked(&uval2, uaddr))
11618 +               goto out_efault;
11620 +       if (uval != uval2)
11621 +               goto out_eagain;
11623 +       /*
11624          * Handle the owner died case:
11625          */
11626         if (uval & FUTEX_OWNER_DIED) {
11627 @@ -1006,11 +1080,11 @@
11628                          * is not 0. Inconsistent state. [5]
11629                          */
11630                         if (pid)
11631 -                               return -EINVAL;
11632 +                               goto out_einval;
11633                         /*
11634                          * Take a ref on the state and return success. [4]
11635                          */
11636 -                       goto out_state;
11637 +                       goto out_attach;
11638                 }
11640                 /*
11641 @@ -1022,14 +1096,14 @@
11642                  * Take a ref on the state and return success. [6]
11643                  */
11644                 if (!pid)
11645 -                       goto out_state;
11646 +                       goto out_attach;
11647         } else {
11648                 /*
11649                  * If the owner died bit is not set, then the pi_state
11650                  * must have an owner. [7]
11651                  */
11652                 if (!pi_state->owner)
11653 -                       return -EINVAL;
11654 +                       goto out_einval;
11655         }
11657         /*
11658 @@ -1038,11 +1112,29 @@
11659          * user space TID. [9/10]
11660          */
11661         if (pid != task_pid_vnr(pi_state->owner))
11662 -               return -EINVAL;
11663 -out_state:
11664 -       atomic_inc(&pi_state->refcount);
11665 +               goto out_einval;
11667 +out_attach:
11668 +       get_pi_state(pi_state);
11669 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11670         *ps = pi_state;
11671         return 0;
11673 +out_einval:
11674 +       ret = -EINVAL;
11675 +       goto out_error;
11677 +out_eagain:
11678 +       ret = -EAGAIN;
11679 +       goto out_error;
11681 +out_efault:
11682 +       ret = -EFAULT;
11683 +       goto out_error;
11685 +out_error:
11686 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11687 +       return ret;
11690  /*
11691 @@ -1093,6 +1185,9 @@
11693         /*
11694          * No existing pi state. First waiter. [2]
11695 +        *
11696 +        * This creates pi_state, we have hb->lock held, this means nothing can
11697 +        * observe this state, wait_lock is irrelevant.
11698          */
11699         pi_state = alloc_pi_state();
11701 @@ -1117,17 +1212,18 @@
11702         return 0;
11705 -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
11706 +static int lookup_pi_state(u32 __user *uaddr, u32 uval,
11707 +                          struct futex_hash_bucket *hb,
11708                            union futex_key *key, struct futex_pi_state **ps)
11710 -       struct futex_q *match = futex_top_waiter(hb, key);
11711 +       struct futex_q *top_waiter = futex_top_waiter(hb, key);
11713         /*
11714          * If there is a waiter on that futex, validate it and
11715          * attach to the pi_state when the validation succeeds.
11716          */
11717 -       if (match)
11718 -               return attach_to_pi_state(uval, match->pi_state, ps);
11719 +       if (top_waiter)
11720 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
11722         /*
11723          * We are the first waiter - try to look up the owner based on
11724 @@ -1146,7 +1242,7 @@
11725         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
11726                 return -EFAULT;
11728 -       /*If user space value changed, let the caller retry */
11729 +       /* If user space value changed, let the caller retry */
11730         return curval != uval ? -EAGAIN : 0;
11733 @@ -1174,7 +1270,7 @@
11734                                 struct task_struct *task, int set_waiters)
11736         u32 uval, newval, vpid = task_pid_vnr(task);
11737 -       struct futex_q *match;
11738 +       struct futex_q *top_waiter;
11739         int ret;
11741         /*
11742 @@ -1200,9 +1296,9 @@
11743          * Lookup existing state first. If it exists, try to attach to
11744          * its pi_state.
11745          */
11746 -       match = futex_top_waiter(hb, key);
11747 -       if (match)
11748 -               return attach_to_pi_state(uval, match->pi_state, ps);
11749 +       top_waiter = futex_top_waiter(hb, key);
11750 +       if (top_waiter)
11751 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
11753         /*
11754          * No waiter and user TID is 0. We are here because the
11755 @@ -1288,45 +1384,39 @@
11756          * memory barrier is required here to prevent the following
11757          * store to lock_ptr from getting ahead of the plist_del.
11758          */
11759 -       smp_wmb();
11760 -       q->lock_ptr = NULL;
11761 +       smp_store_release(&q->lock_ptr, NULL);
11764 -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11765 -                        struct futex_hash_bucket *hb)
11767 + * Caller must hold a reference on @pi_state.
11768 + */
11769 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
11771 -       struct task_struct *new_owner;
11772 -       struct futex_pi_state *pi_state = this->pi_state;
11773         u32 uninitialized_var(curval), newval;
11774 +       struct task_struct *new_owner;
11775 +       bool deboost = false;
11776         WAKE_Q(wake_q);
11777 -       bool deboost;
11778 +       WAKE_Q(wake_sleeper_q);
11779         int ret = 0;
11781 -       if (!pi_state)
11782 -               return -EINVAL;
11784 -       /*
11785 -        * If current does not own the pi_state then the futex is
11786 -        * inconsistent and user space fiddled with the futex value.
11787 -        */
11788 -       if (pi_state->owner != current)
11789 -               return -EINVAL;
11791 -       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
11792         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
11793 +       if (WARN_ON_ONCE(!new_owner)) {
11794 +               /*
11795 +                * As per the comment in futex_unlock_pi() this should not happen.
11796 +                *
11797 +                * When this happens, give up our locks and try again, giving
11798 +                * the futex_lock_pi() instance time to complete, either by
11799 +                * waiting on the rtmutex or removing itself from the futex
11800 +                * queue.
11801 +                */
11802 +               ret = -EAGAIN;
11803 +               goto out_unlock;
11804 +       }
11806         /*
11807 -        * It is possible that the next waiter (the one that brought
11808 -        * this owner to the kernel) timed out and is no longer
11809 -        * waiting on the lock.
11810 -        */
11811 -       if (!new_owner)
11812 -               new_owner = this->task;
11814 -       /*
11815 -        * We pass it to the next owner. The WAITERS bit is always
11816 -        * kept enabled while there is PI state around. We cleanup the
11817 -        * owner died bit, because we are the owner.
11818 +        * We pass it to the next owner. The WAITERS bit is always kept
11819 +        * enabled while there is PI state around. We cleanup the owner
11820 +        * died bit, because we are the owner.
11821          */
11822         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
11824 @@ -1335,6 +1425,7 @@
11826         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
11827                 ret = -EFAULT;
11829         } else if (curval != uval) {
11830                 /*
11831                  * If an unconditional UNLOCK_PI operation (user space did not
11832 @@ -1347,10 +1438,9 @@
11833                 else
11834                         ret = -EINVAL;
11835         }
11836 -       if (ret) {
11837 -               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11838 -               return ret;
11839 -       }
11841 +       if (ret)
11842 +               goto out_unlock;
11844         raw_spin_lock(&pi_state->owner->pi_lock);
11845         WARN_ON(list_empty(&pi_state->list));
11846 @@ -1363,22 +1453,22 @@
11847         pi_state->owner = new_owner;
11848         raw_spin_unlock(&new_owner->pi_lock);
11850 -       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11852 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
11854         /*
11855 -        * First unlock HB so the waiter does not spin on it once he got woken
11856 -        * up. Second wake up the waiter before the priority is adjusted. If we
11857 -        * deboost first (and lose our higher priority), then the task might get
11858 -        * scheduled away before the wake up can take place.
11859 +        * We've updated the uservalue, this unlock cannot fail.
11860          */
11861 -       spin_unlock(&hb->lock);
11862 -       wake_up_q(&wake_q);
11863 -       if (deboost)
11864 +       deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
11865 +                                         &wake_sleeper_q);
11867 +out_unlock:
11868 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11870 +       if (deboost) {
11871 +               wake_up_q(&wake_q);
11872 +               wake_up_q_sleeper(&wake_sleeper_q);
11873                 rt_mutex_adjust_prio(current);
11874 +       }
11876 -       return 0;
11877 +       return ret;
11880  /*
11881 @@ -1824,7 +1914,7 @@
11882                          * If that call succeeds then we have pi_state and an
11883                          * initial refcount on it.
11884                          */
11885 -                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
11886 +                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
11887                 }
11889                 switch (ret) {
11890 @@ -1907,7 +1997,7 @@
11891                          * refcount on the pi_state and store the pointer in
11892                          * the futex_q object of the waiter.
11893                          */
11894 -                       atomic_inc(&pi_state->refcount);
11895 +                       get_pi_state(pi_state);
11896                         this->pi_state = pi_state;
11897                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
11898                                                         this->rt_waiter,
11899 @@ -1924,6 +2014,16 @@
11900                                 requeue_pi_wake_futex(this, &key2, hb2);
11901                                 drop_count++;
11902                                 continue;
11903 +                       } else if (ret == -EAGAIN) {
11904 +                               /*
11905 +                                * Waiter was woken by timeout or
11906 +                                * signal and has set pi_blocked_on to
11907 +                                * PI_WAKEUP_INPROGRESS before we
11908 +                                * tried to enqueue it on the rtmutex.
11909 +                                */
11910 +                               this->pi_state = NULL;
11911 +                               put_pi_state(pi_state);
11912 +                               continue;
11913                         } else if (ret) {
11914                                 /*
11915                                  * rt_mutex_start_proxy_lock() detected a
11916 @@ -2007,20 +2107,7 @@
11917         hb_waiters_dec(hb);
11920 -/**
11921 - * queue_me() - Enqueue the futex_q on the futex_hash_bucket
11922 - * @q: The futex_q to enqueue
11923 - * @hb:        The destination hash bucket
11924 - *
11925 - * The hb->lock must be held by the caller, and is released here. A call to
11926 - * queue_me() is typically paired with exactly one call to unqueue_me().  The
11927 - * exceptions involve the PI related operations, which may use unqueue_me_pi()
11928 - * or nothing if the unqueue is done as part of the wake process and the unqueue
11929 - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
11930 - * an example).
11931 - */
11932 -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
11933 -       __releases(&hb->lock)
11934 +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
11936         int prio;
11938 @@ -2037,6 +2124,24 @@
11939         plist_node_init(&q->list, prio);
11940         plist_add(&q->list, &hb->chain);
11941         q->task = current;
11944 +/**
11945 + * queue_me() - Enqueue the futex_q on the futex_hash_bucket
11946 + * @q: The futex_q to enqueue
11947 + * @hb:        The destination hash bucket
11948 + *
11949 + * The hb->lock must be held by the caller, and is released here. A call to
11950 + * queue_me() is typically paired with exactly one call to unqueue_me().  The
11951 + * exceptions involve the PI related operations, which may use unqueue_me_pi()
11952 + * or nothing if the unqueue is done as part of the wake process and the unqueue
11953 + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
11954 + * an example).
11955 + */
11956 +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
11957 +       __releases(&hb->lock)
11959 +       __queue_me(q, hb);
11960         spin_unlock(&hb->lock);
11963 @@ -2123,10 +2228,13 @@
11965         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
11966         struct futex_pi_state *pi_state = q->pi_state;
11967 -       struct task_struct *oldowner = pi_state->owner;
11968         u32 uval, uninitialized_var(curval), newval;
11969 +       struct task_struct *oldowner;
11970         int ret;
11972 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
11974 +       oldowner = pi_state->owner;
11975         /* Owner died? */
11976         if (!pi_state->owner)
11977                 newtid |= FUTEX_OWNER_DIED;
11978 @@ -2134,7 +2242,8 @@
11979         /*
11980          * We are here either because we stole the rtmutex from the
11981          * previous highest priority waiter or we are the highest priority
11982 -        * waiter but failed to get the rtmutex the first time.
11983 +        * waiter but have failed to get the rtmutex the first time.
11984 +        *
11985          * We have to replace the newowner TID in the user space variable.
11986          * This must be atomic as we have to preserve the owner died bit here.
11987          *
11988 @@ -2142,17 +2251,16 @@
11989          * because we can fault here. Imagine swapped out pages or a fork
11990          * that marked all the anonymous memory readonly for cow.
11991          *
11992 -        * Modifying pi_state _before_ the user space value would
11993 -        * leave the pi_state in an inconsistent state when we fault
11994 -        * here, because we need to drop the hash bucket lock to
11995 -        * handle the fault. This might be observed in the PID check
11996 -        * in lookup_pi_state.
11997 +        * Modifying pi_state _before_ the user space value would leave the
11998 +        * pi_state in an inconsistent state when we fault here, because we
11999 +        * need to drop the locks to handle the fault. This might be observed
12000 +        * in the PID check in lookup_pi_state.
12001          */
12002  retry:
12003         if (get_futex_value_locked(&uval, uaddr))
12004                 goto handle_fault;
12006 -       while (1) {
12007 +       for (;;) {
12008                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
12010                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
12011 @@ -2167,47 +2275,60 @@
12012          * itself.
12013          */
12014         if (pi_state->owner != NULL) {
12015 -               raw_spin_lock_irq(&pi_state->owner->pi_lock);
12016 +               raw_spin_lock(&pi_state->owner->pi_lock);
12017                 WARN_ON(list_empty(&pi_state->list));
12018                 list_del_init(&pi_state->list);
12019 -               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
12020 +               raw_spin_unlock(&pi_state->owner->pi_lock);
12021         }
12023         pi_state->owner = newowner;
12025 -       raw_spin_lock_irq(&newowner->pi_lock);
12026 +       raw_spin_lock(&newowner->pi_lock);
12027         WARN_ON(!list_empty(&pi_state->list));
12028         list_add(&pi_state->list, &newowner->pi_state_list);
12029 -       raw_spin_unlock_irq(&newowner->pi_lock);
12030 +       raw_spin_unlock(&newowner->pi_lock);
12031 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12033         return 0;
12035         /*
12036 -        * To handle the page fault we need to drop the hash bucket
12037 -        * lock here. That gives the other task (either the highest priority
12038 -        * waiter itself or the task which stole the rtmutex) the
12039 -        * chance to try the fixup of the pi_state. So once we are
12040 -        * back from handling the fault we need to check the pi_state
12041 -        * after reacquiring the hash bucket lock and before trying to
12042 -        * do another fixup. When the fixup has been done already we
12043 -        * simply return.
12044 +        * To handle the page fault we need to drop the locks here. That gives
12045 +        * the other task (either the highest priority waiter itself or the
12046 +        * task which stole the rtmutex) the chance to try the fixup of the
12047 +        * pi_state. So once we are back from handling the fault we need to
12048 +        * check the pi_state after reacquiring the locks and before trying to
12049 +        * do another fixup. When the fixup has been done already we simply
12050 +        * return.
12051 +        *
12052 +        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
12053 +        * drop hb->lock since the caller owns the hb -> futex_q relation.
12054 +        * Dropping the pi_mutex->wait_lock requires the state revalidate.
12055          */
12056  handle_fault:
12057 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12058         spin_unlock(q->lock_ptr);
12060         ret = fault_in_user_writeable(uaddr);
12062         spin_lock(q->lock_ptr);
12063 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12065         /*
12066          * Check if someone else fixed it for us:
12067          */
12068 -       if (pi_state->owner != oldowner)
12069 -               return 0;
12070 +       if (pi_state->owner != oldowner) {
12071 +               ret = 0;
12072 +               goto out_unlock;
12073 +       }
12075         if (ret)
12076 -               return ret;
12077 +               goto out_unlock;
12079         goto retry;
12081 +out_unlock:
12082 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12083 +       return ret;
12086  static long futex_wait_restart(struct restart_block *restart);
12087 @@ -2229,13 +2350,16 @@
12088   */
12089  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12091 -       struct task_struct *owner;
12092         int ret = 0;
12094         if (locked) {
12095                 /*
12096                  * Got the lock. We might not be the anticipated owner if we
12097                  * did a lock-steal - fix up the PI-state in that case:
12098 +                *
12099 +                * We can safely read pi_state->owner without holding wait_lock
12100 +                * because we now own the rt_mutex, only the owner will attempt
12101 +                * to change it.
12102                  */
12103                 if (q->pi_state->owner != current)
12104                         ret = fixup_pi_state_owner(uaddr, q, current);
12105 @@ -2243,43 +2367,15 @@
12106         }
12108         /*
12109 -        * Catch the rare case, where the lock was released when we were on the
12110 -        * way back before we locked the hash bucket.
12111 -        */
12112 -       if (q->pi_state->owner == current) {
12113 -               /*
12114 -                * Try to get the rt_mutex now. This might fail as some other
12115 -                * task acquired the rt_mutex after we removed ourself from the
12116 -                * rt_mutex waiters list.
12117 -                */
12118 -               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
12119 -                       locked = 1;
12120 -                       goto out;
12121 -               }
12123 -               /*
12124 -                * pi_state is incorrect, some other task did a lock steal and
12125 -                * we returned due to timeout or signal without taking the
12126 -                * rt_mutex. Too late.
12127 -                */
12128 -               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
12129 -               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
12130 -               if (!owner)
12131 -                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
12132 -               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
12133 -               ret = fixup_pi_state_owner(uaddr, q, owner);
12134 -               goto out;
12135 -       }
12137 -       /*
12138          * Paranoia check. If we did not take the lock, then we should not be
12139          * the owner of the rt_mutex.
12140          */
12141 -       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
12142 +       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
12143                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
12144                                 "pi-state %p\n", ret,
12145                                 q->pi_state->pi_mutex.owner,
12146                                 q->pi_state->owner);
12147 +       }
12149  out:
12150         return ret ? ret : locked;
12151 @@ -2503,6 +2599,8 @@
12152                          ktime_t *time, int trylock)
12154         struct hrtimer_sleeper timeout, *to = NULL;
12155 +       struct futex_pi_state *pi_state = NULL;
12156 +       struct rt_mutex_waiter rt_waiter;
12157         struct futex_hash_bucket *hb;
12158         struct futex_q q = futex_q_init;
12159         int res, ret;
12160 @@ -2555,25 +2653,77 @@
12161                 }
12162         }
12164 +       WARN_ON(!q.pi_state);
12166         /*
12167          * Only actually queue now that the atomic ops are done:
12168          */
12169 -       queue_me(&q, hb);
12170 +       __queue_me(&q, hb);
12172 -       WARN_ON(!q.pi_state);
12173 -       /*
12174 -        * Block on the PI mutex:
12175 -        */
12176 -       if (!trylock) {
12177 -               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
12178 -       } else {
12179 -               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
12180 +       if (trylock) {
12181 +               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
12182                 /* Fixup the trylock return value: */
12183                 ret = ret ? 0 : -EWOULDBLOCK;
12184 +               goto no_block;
12185 +       }
12187 +       rt_mutex_init_waiter(&rt_waiter, false);
12189 +       /*
12190 +        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
12191 +        * hold it while doing rt_mutex_start_proxy(), because then it will
12192 +        * include hb->lock in the blocking chain, even though we'll not in
12193 +        * fact hold it while blocking. This will lead it to report -EDEADLK
12194 +        * and BUG when futex_unlock_pi() interleaves with this.
12195 +        *
12196 +        * Therefore acquire wait_lock while holding hb->lock, but drop the
12197 +        * latter before calling rt_mutex_start_proxy_lock(). This still fully
12198 +        * serializes against futex_unlock_pi() as that does the exact same
12199 +        * lock handoff sequence.
12200 +        */
12201 +       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
12202 +       /*
12203 +        * the migrate_disable() here disables migration in the in_atomic() fast
12204 +        * path which is enabled again in the following spin_unlock(). We have
12205 +        * one migrate_disable() pending in the slow-path which is reversed
12206 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
12207 +        */
12208 +       migrate_disable();
12210 +       spin_unlock(q.lock_ptr);
12211 +       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
12212 +       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
12213 +       migrate_enable();
12215 +       if (ret) {
12216 +               if (ret == 1)
12217 +                       ret = 0;
12219 +               spin_lock(q.lock_ptr);
12220 +               goto no_block;
12221         }
12224 +       if (unlikely(to))
12225 +               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
12227 +       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
12229         spin_lock(q.lock_ptr);
12230         /*
12231 +        * If we failed to acquire the lock (signal/timeout), we must
12232 +        * first acquire the hb->lock before removing the lock from the
12233 +        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
12234 +        * wait lists consistent.
12235 +        *
12236 +        * In particular; it is important that futex_unlock_pi() can not
12237 +        * observe this inconsistency.
12238 +        */
12239 +       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
12240 +               ret = 0;
12242 +no_block:
12243 +       /*
12244          * Fixup the pi_state owner and possibly acquire the lock if we
12245          * haven't already.
12246          */
12247 @@ -2589,12 +2739,19 @@
12248          * If fixup_owner() faulted and was unable to handle the fault, unlock
12249          * it and return the fault to userspace.
12250          */
12251 -       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
12252 -               rt_mutex_unlock(&q.pi_state->pi_mutex);
12253 +       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
12254 +               pi_state = q.pi_state;
12255 +               get_pi_state(pi_state);
12256 +       }
12258         /* Unqueue and drop the lock */
12259         unqueue_me_pi(&q);
12261 +       if (pi_state) {
12262 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12263 +               put_pi_state(pi_state);
12264 +       }
12266         goto out_put_key;
12268  out_unlock_put_key:
12269 @@ -2631,7 +2788,7 @@
12270         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
12271         union futex_key key = FUTEX_KEY_INIT;
12272         struct futex_hash_bucket *hb;
12273 -       struct futex_q *match;
12274 +       struct futex_q *top_waiter;
12275         int ret;
12277  retry:
12278 @@ -2655,12 +2812,48 @@
12279          * all and we at least want to know if user space fiddled
12280          * with the futex value instead of blindly unlocking.
12281          */
12282 -       match = futex_top_waiter(hb, &key);
12283 -       if (match) {
12284 -               ret = wake_futex_pi(uaddr, uval, match, hb);
12285 +       top_waiter = futex_top_waiter(hb, &key);
12286 +       if (top_waiter) {
12287 +               struct futex_pi_state *pi_state = top_waiter->pi_state;
12289 +               ret = -EINVAL;
12290 +               if (!pi_state)
12291 +                       goto out_unlock;
12293                 /*
12294 -                * In case of success wake_futex_pi dropped the hash
12295 -                * bucket lock.
12296 +                * If current does not own the pi_state then the futex is
12297 +                * inconsistent and user space fiddled with the futex value.
12298 +                */
12299 +               if (pi_state->owner != current)
12300 +                       goto out_unlock;
12302 +               get_pi_state(pi_state);
12303 +               /*
12304 +                * By taking wait_lock while still holding hb->lock, we ensure
12305 +                * there is no point where we hold neither; and therefore
12306 +                * wake_futex_pi() must observe a state consistent with what we
12307 +                * observed.
12308 +                */
12309 +               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12310 +               /*
12311 +                * Magic trickery for now to make the RT migrate disable
12312 +                * logic happy. The following spin_unlock() happens with
12313 +                * interrupts disabled so the internal migrate_enable()
12314 +                * won't undo the migrate_disable() which was issued when
12315 +                * locking hb->lock.
12316 +                */
12317 +               migrate_disable();
12318 +               spin_unlock(&hb->lock);
12320 +               /* Drops pi_state->pi_mutex.wait_lock */
12321 +               ret = wake_futex_pi(uaddr, uval, pi_state);
12323 +               migrate_enable();
12325 +               put_pi_state(pi_state);
12327 +               /*
12328 +                * Success, we're done! No tricky corner cases.
12329                  */
12330                 if (!ret)
12331                         goto out_putkey;
12332 @@ -2675,7 +2868,6 @@
12333                  * setting the FUTEX_WAITERS bit. Try again.
12334                  */
12335                 if (ret == -EAGAIN) {
12336 -                       spin_unlock(&hb->lock);
12337                         put_futex_key(&key);
12338                         goto retry;
12339                 }
12340 @@ -2683,7 +2875,7 @@
12341                  * wake_futex_pi has detected invalid state. Tell user
12342                  * space.
12343                  */
12344 -               goto out_unlock;
12345 +               goto out_putkey;
12346         }
12348         /*
12349 @@ -2693,8 +2885,10 @@
12350          * preserve the WAITERS bit not the OWNER_DIED one. We are the
12351          * owner.
12352          */
12353 -       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
12354 +       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
12355 +               spin_unlock(&hb->lock);
12356                 goto pi_faulted;
12357 +       }
12359         /*
12360          * If uval has changed, let user space handle it.
12361 @@ -2708,7 +2902,6 @@
12362         return ret;
12364  pi_faulted:
12365 -       spin_unlock(&hb->lock);
12366         put_futex_key(&key);
12368         ret = fault_in_user_writeable(uaddr);
12369 @@ -2812,8 +3005,9 @@
12370                                  u32 __user *uaddr2)
12372         struct hrtimer_sleeper timeout, *to = NULL;
12373 +       struct futex_pi_state *pi_state = NULL;
12374         struct rt_mutex_waiter rt_waiter;
12375 -       struct futex_hash_bucket *hb;
12376 +       struct futex_hash_bucket *hb, *hb2;
12377         union futex_key key2 = FUTEX_KEY_INIT;
12378         struct futex_q q = futex_q_init;
12379         int res, ret;
12380 @@ -2838,10 +3032,7 @@
12381          * The waiter is allocated on our stack, manipulated by the requeue
12382          * code while we sleep on uaddr.
12383          */
12384 -       debug_rt_mutex_init_waiter(&rt_waiter);
12385 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
12386 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
12387 -       rt_waiter.task = NULL;
12388 +       rt_mutex_init_waiter(&rt_waiter, false);
12390         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
12391         if (unlikely(ret != 0))
12392 @@ -2872,20 +3063,55 @@
12393         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
12394         futex_wait_queue_me(hb, &q, to);
12396 -       spin_lock(&hb->lock);
12397 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12398 -       spin_unlock(&hb->lock);
12399 -       if (ret)
12400 -               goto out_put_keys;
12401 +       /*
12402 +        * On RT we must avoid races with requeue and trying to block
12403 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
12404 +        * serializing access to pi_blocked_on with pi_lock.
12405 +        */
12406 +       raw_spin_lock_irq(&current->pi_lock);
12407 +       if (current->pi_blocked_on) {
12408 +               /*
12409 +                * We have been requeued or are in the process of
12410 +                * being requeued.
12411 +                */
12412 +               raw_spin_unlock_irq(&current->pi_lock);
12413 +       } else {
12414 +               /*
12415 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
12416 +                * prevents a concurrent requeue from moving us to the
12417 +                * uaddr2 rtmutex. After that we can safely acquire
12418 +                * (and possibly block on) hb->lock.
12419 +                */
12420 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
12421 +               raw_spin_unlock_irq(&current->pi_lock);
12423 +               spin_lock(&hb->lock);
12425 +               /*
12426 +                * Clean up pi_blocked_on. We might leak it otherwise
12427 +                * when we succeeded with the hb->lock in the fast
12428 +                * path.
12429 +                */
12430 +               raw_spin_lock_irq(&current->pi_lock);
12431 +               current->pi_blocked_on = NULL;
12432 +               raw_spin_unlock_irq(&current->pi_lock);
12434 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12435 +               spin_unlock(&hb->lock);
12436 +               if (ret)
12437 +                       goto out_put_keys;
12438 +       }
12440         /*
12441 -        * In order for us to be here, we know our q.key == key2, and since
12442 -        * we took the hb->lock above, we also know that futex_requeue() has
12443 -        * completed and we no longer have to concern ourselves with a wakeup
12444 -        * race with the atomic proxy lock acquisition by the requeue code. The
12445 -        * futex_requeue dropped our key1 reference and incremented our key2
12446 -        * reference count.
12447 +        * In order to be here, we have either been requeued, are in
12448 +        * the process of being requeued, or the requeue code successfully
12449 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
12450 +        * non-null above, we may be racing with a requeue.  Do not
12451 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
12452 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
12453 +        * reference and incremented our key2 reference count.
12454          */
12455 +       hb2 = hash_futex(&key2);
12457         /* Check if the requeue code acquired the second futex for us. */
12458         if (!q.rt_waiter) {
12459 @@ -2894,16 +3120,19 @@
12460                  * did a lock-steal - fix up the PI-state in that case.
12461                  */
12462                 if (q.pi_state && (q.pi_state->owner != current)) {
12463 -                       spin_lock(q.lock_ptr);
12464 +                       spin_lock(&hb2->lock);
12465 +                       BUG_ON(&hb2->lock != q.lock_ptr);
12466                         ret = fixup_pi_state_owner(uaddr2, &q, current);
12467 -                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
12468 -                               rt_mutex_unlock(&q.pi_state->pi_mutex);
12469 +                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
12470 +                               pi_state = q.pi_state;
12471 +                               get_pi_state(pi_state);
12472 +                       }
12473                         /*
12474                          * Drop the reference to the pi state which
12475                          * the requeue_pi() code acquired for us.
12476                          */
12477                         put_pi_state(q.pi_state);
12478 -                       spin_unlock(q.lock_ptr);
12479 +                       spin_unlock(&hb2->lock);
12480                 }
12481         } else {
12482                 struct rt_mutex *pi_mutex;
12483 @@ -2915,10 +3144,14 @@
12484                  */
12485                 WARN_ON(!q.pi_state);
12486                 pi_mutex = &q.pi_state->pi_mutex;
12487 -               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
12488 -               debug_rt_mutex_free_waiter(&rt_waiter);
12489 +               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
12491 -               spin_lock(q.lock_ptr);
12492 +               spin_lock(&hb2->lock);
12493 +               BUG_ON(&hb2->lock != q.lock_ptr);
12494 +               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
12495 +                       ret = 0;
12497 +               debug_rt_mutex_free_waiter(&rt_waiter);
12498                 /*
12499                  * Fixup the pi_state owner and possibly acquire the lock if we
12500                  * haven't already.
12501 @@ -2936,13 +3169,20 @@
12502                  * the fault, unlock the rt_mutex and return the fault to
12503                  * userspace.
12504                  */
12505 -               if (ret && rt_mutex_owner(pi_mutex) == current)
12506 -                       rt_mutex_unlock(pi_mutex);
12507 +               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
12508 +                       pi_state = q.pi_state;
12509 +                       get_pi_state(pi_state);
12510 +               }
12512                 /* Unqueue and drop the lock. */
12513                 unqueue_me_pi(&q);
12514         }
12516 +       if (pi_state) {
12517 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12518 +               put_pi_state(pi_state);
12519 +       }
12521         if (ret == -EINTR) {
12522                 /*
12523                  * We've already been requeued, but cannot restart by calling
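
The futex hunks above depend on a single ordering rule: pi_state->pi_mutex.wait_lock is acquired while hb->lock is still held, and only then is hb->lock dropped, so there is never a window in which neither lock is held and futex_unlock_pi() always observes a consistent state. Below is a minimal user-space sketch of that lock-handoff pattern using two pthread mutexes; the names outer, inner and update_with_handoff() are invented for illustration and are not part of the patch.

    /* Sketch only: models the hb->lock / wait_lock handoff in user space. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* plays hb->lock */
    static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* plays wait_lock */
    static int shared_state;

    static void update_with_handoff(int val)
    {
            pthread_mutex_lock(&outer);
            /* Overlap: take the inner lock before releasing the outer one. */
            pthread_mutex_lock(&inner);
            pthread_mutex_unlock(&outer);

            /* Anyone serialized on either lock sees a consistent state. */
            shared_state = val;

            pthread_mutex_unlock(&inner);
    }

    int main(void)
    {
            update_with_handoff(42);
            printf("state = %d\n", shared_state);
            return 0;
    }
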
12524 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/handle.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/handle.c
12525 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/handle.c     2017-04-16 10:38:29.000000000 +0200
12526 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/handle.c  2017-04-18 17:54:26.000000000 +0200
12527 @@ -181,10 +181,16 @@
12529         irqreturn_t retval;
12530         unsigned int flags = 0;
12531 +       struct pt_regs *regs = get_irq_regs();
12532 +       u64 ip = regs ? instruction_pointer(regs) : 0;
12534         retval = __handle_irq_event_percpu(desc, &flags);
12536 -       add_interrupt_randomness(desc->irq_data.irq, flags);
12537 +#ifdef CONFIG_PREEMPT_RT_FULL
12538 +       desc->random_ip = ip;
12539 +#else
12540 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
12541 +#endif
12543         if (!noirqdebug)
12544                 note_interrupt(desc, retval);
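
On PREEMPT_RT_FULL the hunk above no longer calls add_interrupt_randomness() from the hard interrupt; it only records the interrupted instruction pointer in desc->random_ip, and the sample is fed to the entropy pool later from the interrupt thread (see the kernel/irq/manage.c hunk below). A rough user-space sketch of that capture-now/consume-later split; fast_path_record() and slow_path_consume() are invented names and the mixing is purely illustrative.

    #include <stdint.h>
    #include <stdio.h>

    /* Value captured in the fast path, folded into a pool later. */
    static uint64_t pending_ip;
    static uint64_t entropy_pool;

    /* Fast path: only stash the sample, do no heavy work here. */
    static void fast_path_record(uint64_t ip)
    {
            pending_ip = ip;
    }

    /* Slow path: mix the stashed sample into the pool. */
    static void slow_path_consume(uint64_t salt)
    {
            entropy_pool ^= pending_ip ^ salt;
    }

    int main(void)
    {
            fast_path_record(0xdeadbeefULL); /* would be instruction_pointer(regs) */
            slow_path_consume(0x1234ULL);    /* would run from the irq thread */
            printf("pool = %llx\n", (unsigned long long)entropy_pool);
            return 0;
    }
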
12545 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/manage.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/manage.c
12546 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/manage.c     2017-04-16 10:38:29.000000000 +0200
12547 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/manage.c  2017-04-18 17:54:26.000000000 +0200
12548 @@ -22,6 +22,7 @@
12549  #include "internals.h"
12551  #ifdef CONFIG_IRQ_FORCED_THREADING
12552 +# ifndef CONFIG_PREEMPT_RT_BASE
12553  __read_mostly bool force_irqthreads;
12555  static int __init setup_forced_irqthreads(char *arg)
12556 @@ -30,6 +31,7 @@
12557         return 0;
12559  early_param("threadirqs", setup_forced_irqthreads);
12560 +# endif
12561  #endif
12563  static void __synchronize_hardirq(struct irq_desc *desc)
12564 @@ -233,7 +235,12 @@
12566         if (desc->affinity_notify) {
12567                 kref_get(&desc->affinity_notify->kref);
12569 +#ifdef CONFIG_PREEMPT_RT_BASE
12570 +               swork_queue(&desc->affinity_notify->swork);
12571 +#else
12572                 schedule_work(&desc->affinity_notify->work);
12573 +#endif
12574         }
12575         irqd_set(data, IRQD_AFFINITY_SET);
12577 @@ -271,10 +278,8 @@
12579  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
12581 -static void irq_affinity_notify(struct work_struct *work)
12582 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
12584 -       struct irq_affinity_notify *notify =
12585 -               container_of(work, struct irq_affinity_notify, work);
12586         struct irq_desc *desc = irq_to_desc(notify->irq);
12587         cpumask_var_t cpumask;
12588         unsigned long flags;
12589 @@ -296,6 +301,35 @@
12590         kref_put(&notify->kref, notify->release);
12593 +#ifdef CONFIG_PREEMPT_RT_BASE
12594 +static void init_helper_thread(void)
12596 +       static int init_sworker_once;
12598 +       if (init_sworker_once)
12599 +               return;
12600 +       if (WARN_ON(swork_get()))
12601 +               return;
12602 +       init_sworker_once = 1;
12605 +static void irq_affinity_notify(struct swork_event *swork)
12607 +       struct irq_affinity_notify *notify =
12608 +               container_of(swork, struct irq_affinity_notify, swork);
12609 +       _irq_affinity_notify(notify);
12612 +#else
12614 +static void irq_affinity_notify(struct work_struct *work)
12616 +       struct irq_affinity_notify *notify =
12617 +               container_of(work, struct irq_affinity_notify, work);
12618 +       _irq_affinity_notify(notify);
12620 +#endif
12622  /**
12623   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
12624   *     @irq:           Interrupt for which to enable/disable notification
12625 @@ -324,7 +358,12 @@
12626         if (notify) {
12627                 notify->irq = irq;
12628                 kref_init(&notify->kref);
12629 +#ifdef CONFIG_PREEMPT_RT_BASE
12630 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
12631 +               init_helper_thread();
12632 +#else
12633                 INIT_WORK(&notify->work, irq_affinity_notify);
12634 +#endif
12635         }
12637         raw_spin_lock_irqsave(&desc->lock, flags);
12638 @@ -879,7 +918,15 @@
12639         local_bh_disable();
12640         ret = action->thread_fn(action->irq, action->dev_id);
12641         irq_finalize_oneshot(desc, action);
12642 -       local_bh_enable();
12643 +       /*
12644 +        * Interrupts which have real time requirements can be set up
12645 +        * to avoid softirq processing in the thread handler. This is
12646 +        * safe as these interrupts do not raise soft interrupts.
12647 +        */
12648 +       if (irq_settings_no_softirq_call(desc))
12649 +               _local_bh_enable();
12650 +       else
12651 +               local_bh_enable();
12652         return ret;
12655 @@ -976,6 +1023,12 @@
12656                 if (action_ret == IRQ_WAKE_THREAD)
12657                         irq_wake_secondary(desc, action);
12659 +#ifdef CONFIG_PREEMPT_RT_FULL
12660 +               migrate_disable();
12661 +               add_interrupt_randomness(action->irq, 0,
12662 +                                desc->random_ip ^ (unsigned long) action);
12663 +               migrate_enable();
12664 +#endif
12665                 wake_threads_waitq(desc);
12666         }
12668 @@ -1336,6 +1389,9 @@
12669                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
12670                 }
12672 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
12673 +                       irq_settings_set_no_softirq_call(desc);
12675                 /* Set default affinity mask once everything is setup */
12676                 setup_affinity(desc, mask);
12678 @@ -2061,7 +2117,7 @@
12679   *     This call sets the internal irqchip state of an interrupt,
12680   *     depending on the value of @which.
12681   *
12682 - *     This function should be called with preemption disabled if the
12683 + *     This function should be called with migration disabled if the
12684   *     interrupt controller has per-cpu registers.
12685   */
12686  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
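
Two RT-specific behaviours are added to kernel/irq/manage.c above: affinity notifications are queued through the simple-work (swork) infrastructure instead of schedule_work(), and a forced-threaded handler whose descriptor carries the no-softirq-call setting re-enables bottom halves with _local_bh_enable(), so no softirq processing runs in the interrupt thread. A driver opts into the latter by passing IRQF_NO_SOFTIRQ_CALL (introduced elsewhere in this patch) when requesting the interrupt. The fragment below is only a sketch of such a caller; my_dev_irq, my_dev_handler and "my_dev" are made-up names, and it assumes a kernel built with this patch applied.

    #include <linux/interrupt.h>

    /* Hypothetical handler with real-time requirements; it must not raise
     * softirqs itself, which is what makes the flag safe to use. */
    static irqreturn_t my_dev_handler(int irq, void *dev_id)
    {
            /* ... acknowledge the device, wake the consumer ... */
            return IRQ_HANDLED;
    }

    static int my_dev_setup_irq(unsigned int my_dev_irq, void *my_dev)
    {
            /* IRQF_NO_SOFTIRQ_CALL: skip softirq processing when the forced
             * irq thread re-enables bottom halves after this handler. */
            return request_irq(my_dev_irq, my_dev_handler,
                               IRQF_NO_SOFTIRQ_CALL, "my_dev", my_dev);
    }
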
12687 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/settings.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/settings.h
12688 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/settings.h   2017-04-16 10:38:29.000000000 +0200
12689 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/settings.h        2017-04-18 17:54:26.000000000 +0200
12690 @@ -16,6 +16,7 @@
12691         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
12692         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
12693         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
12694 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
12695         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
12696  };
12698 @@ -30,6 +31,7 @@
12699  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
12700  #define IRQ_IS_POLLED          GOT_YOU_MORON
12701  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
12702 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
12703  #undef IRQF_MODIFY_MASK
12704  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
12706 @@ -40,6 +42,16 @@
12707         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
12710 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
12712 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
12715 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
12717 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
12720  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
12722         return desc->status_use_accessors & _IRQ_PER_CPU;
12723 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/spurious.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/spurious.c
12724 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq/spurious.c   2017-04-16 10:38:29.000000000 +0200
12725 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq/spurious.c        2017-04-18 17:54:26.000000000 +0200
12726 @@ -442,6 +442,10 @@
12728  static int __init irqfixup_setup(char *str)
12730 +#ifdef CONFIG_PREEMPT_RT_BASE
12731 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
12732 +       return 1;
12733 +#endif
12734         irqfixup = 1;
12735         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
12736         printk(KERN_WARNING "This may impact system performance.\n");
12737 @@ -454,6 +458,10 @@
12739  static int __init irqpoll_setup(char *str)
12741 +#ifdef CONFIG_PREEMPT_RT_BASE
12742 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
12743 +       return 1;
12744 +#endif
12745         irqfixup = 2;
12746         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
12747                                 "enabled\n");
12748 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq_work.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq_work.c
12749 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/irq_work.c       2017-04-16 10:38:29.000000000 +0200
12750 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/irq_work.c    2017-04-18 17:54:26.000000000 +0200
12751 @@ -17,6 +17,7 @@
12752  #include <linux/cpu.h>
12753  #include <linux/notifier.h>
12754  #include <linux/smp.h>
12755 +#include <linux/interrupt.h>
12756  #include <asm/processor.h>
12759 @@ -65,6 +66,8 @@
12760   */
12761  bool irq_work_queue_on(struct irq_work *work, int cpu)
12763 +       struct llist_head *list;
12765         /* All work should have been flushed before going offline */
12766         WARN_ON_ONCE(cpu_is_offline(cpu));
12768 @@ -75,7 +78,12 @@
12769         if (!irq_work_claim(work))
12770                 return false;
12772 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
12773 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
12774 +               list = &per_cpu(lazy_list, cpu);
12775 +       else
12776 +               list = &per_cpu(raised_list, cpu);
12778 +       if (llist_add(&work->llnode, list))
12779                 arch_send_call_function_single_ipi(cpu);
12781         return true;
12782 @@ -86,6 +94,9 @@
12783  /* Enqueue the irq work @work on the current CPU */
12784  bool irq_work_queue(struct irq_work *work)
12786 +       struct llist_head *list;
12787 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
12789         /* Only queue if not already pending */
12790         if (!irq_work_claim(work))
12791                 return false;
12792 @@ -93,13 +104,15 @@
12793         /* Queue the entry and raise the IPI if needed. */
12794         preempt_disable();
12796 -       /* If the work is "lazy", handle it from next tick if any */
12797 -       if (work->flags & IRQ_WORK_LAZY) {
12798 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
12799 -                   tick_nohz_tick_stopped())
12800 -                       arch_irq_work_raise();
12801 -       } else {
12802 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
12803 +       lazy_work = work->flags & IRQ_WORK_LAZY;
12805 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
12806 +               list = this_cpu_ptr(&lazy_list);
12807 +       else
12808 +               list = this_cpu_ptr(&raised_list);
12810 +       if (llist_add(&work->llnode, list)) {
12811 +               if (!lazy_work || tick_nohz_tick_stopped())
12812                         arch_irq_work_raise();
12813         }
12815 @@ -116,9 +129,8 @@
12816         raised = this_cpu_ptr(&raised_list);
12817         lazy = this_cpu_ptr(&lazy_list);
12819 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
12820 -               if (llist_empty(lazy))
12821 -                       return false;
12822 +       if (llist_empty(raised) && llist_empty(lazy))
12823 +               return false;
12825         /* All work should have been flushed before going offline */
12826         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
12827 @@ -132,7 +144,7 @@
12828         struct irq_work *work;
12829         struct llist_node *llnode;
12831 -       BUG_ON(!irqs_disabled());
12832 +       BUG_ON_NONRT(!irqs_disabled());
12834         if (llist_empty(list))
12835                 return;
12836 @@ -169,7 +181,16 @@
12837  void irq_work_run(void)
12839         irq_work_run_list(this_cpu_ptr(&raised_list));
12840 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
12841 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
12842 +               /*
12843 +                * NOTE: we raise softirq via IPI for safety,
12844 +                * and execute in irq_work_tick() to move the
12845 +                * overhead from hard to soft irq context.
12846 +                */
12847 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
12848 +                       raise_softirq(TIMER_SOFTIRQ);
12849 +       } else
12850 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
12852  EXPORT_SYMBOL_GPL(irq_work_run);
12854 @@ -179,8 +200,17 @@
12856         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
12857                 irq_work_run_list(raised);
12859 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
12860 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
12863 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12864 +void irq_work_tick_soft(void)
12866         irq_work_run_list(this_cpu_ptr(&lazy_list));
12868 +#endif
12870  /*
12871   * Synchronize against the irq_work @entry, ensures the entry is not
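
The irq_work.c changes above split queued work between two per-cpu lists: on PREEMPT_RT_FULL everything not explicitly marked IRQ_WORK_HARD_IRQ goes to lazy_list and runs from the timer softirq via irq_work_tick_soft(), while raised_list keeps the items that must run in hard interrupt context. The user-space model below mirrors only that routing decision; pick_list(), QUEUE_LAZY/QUEUE_RAISED and the flag values are invented for illustration.

    #include <stdbool.h>
    #include <stdio.h>

    #define IRQ_WORK_LAZY      (1 << 0)   /* run from the next tick       */
    #define IRQ_WORK_HARD_IRQ  (1 << 1)   /* must run in hard irq context */

    enum queue { QUEUE_RAISED, QUEUE_LAZY };

    /* Mirrors the list-selection logic in irq_work_queue() above. */
    static enum queue pick_list(unsigned int flags, bool rt_full)
    {
            bool lazy_work = flags & IRQ_WORK_LAZY;

            if (lazy_work || (rt_full && !(flags & IRQ_WORK_HARD_IRQ)))
                    return QUEUE_LAZY;
            return QUEUE_RAISED;
    }

    int main(void)
    {
            /* On RT, ordinary work is deferred ... */
            printf("%d\n", pick_list(0, true));                 /* QUEUE_LAZY   */
            /* ... unless it insists on hard irq context. */
            printf("%d\n", pick_list(IRQ_WORK_HARD_IRQ, true)); /* QUEUE_RAISED */
            /* On !RT only IRQ_WORK_LAZY work goes to the lazy list. */
            printf("%d\n", pick_list(0, false));                /* QUEUE_RAISED */
            return 0;
    }
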
12872 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/ksysfs.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/ksysfs.c
12873 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/ksysfs.c 2017-04-16 10:38:29.000000000 +0200
12874 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/ksysfs.c      2017-04-18 17:54:26.000000000 +0200
12875 @@ -136,6 +136,15 @@
12877  #endif /* CONFIG_KEXEC_CORE */
12879 +#if defined(CONFIG_PREEMPT_RT_FULL)
12880 +static ssize_t  realtime_show(struct kobject *kobj,
12881 +                             struct kobj_attribute *attr, char *buf)
12883 +       return sprintf(buf, "%d\n", 1);
12885 +KERNEL_ATTR_RO(realtime);
12886 +#endif
12888  /* whether file capabilities are enabled */
12889  static ssize_t fscaps_show(struct kobject *kobj,
12890                                   struct kobj_attribute *attr, char *buf)
12891 @@ -225,6 +234,9 @@
12892         &rcu_expedited_attr.attr,
12893         &rcu_normal_attr.attr,
12894  #endif
12895 +#ifdef CONFIG_PREEMPT_RT_FULL
12896 +       &realtime_attr.attr,
12897 +#endif
12898         NULL
12899  };
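
The ksysfs.c hunk above adds a read-only /sys/kernel/realtime attribute that reports 1 on a CONFIG_PREEMPT_RT_FULL kernel; the file does not exist otherwise, so user space can treat its presence as the RT indicator. A small sketch of such a check:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            /* The attribute only exists on PREEMPT_RT_FULL kernels. */
            if (f) {
                    if (fscanf(f, "%d", &rt) != 1)
                            rt = 0;
                    fclose(f);
            }
            printf("PREEMPT_RT_FULL kernel: %s\n", rt ? "yes" : "no");
            return 0;
    }
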
12901 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/Makefile linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/Makefile
12902 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/Makefile 2017-04-16 10:38:29.000000000 +0200
12903 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/Makefile      2017-04-18 17:54:26.000000000 +0200
12904 @@ -2,7 +2,7 @@
12905  # and is generally not a function of system call inputs.
12906  KCOV_INSTRUMENT                := n
12908 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
12909 +obj-y += semaphore.o percpu-rwsem.o
12911  ifdef CONFIG_FUNCTION_TRACER
12912  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
12913 @@ -11,7 +11,11 @@
12914  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
12915  endif
12917 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
12918 +obj-y += mutex.o
12919  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12920 +endif
12921 +obj-y += rwsem.o
12922  obj-$(CONFIG_LOCKDEP) += lockdep.o
12923  ifeq ($(CONFIG_PROC_FS),y)
12924  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
12925 @@ -24,7 +28,10 @@
12926  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
12927  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
12928  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
12929 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
12930  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
12931  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
12932 +endif
12933 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o rwsem-rt.o
12934  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
12935  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
12936 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/lockdep.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/lockdep.c
12937 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/lockdep.c        2017-04-16 10:38:29.000000000 +0200
12938 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/lockdep.c     2017-04-18 17:54:26.000000000 +0200
12939 @@ -658,6 +658,7 @@
12940         struct lockdep_subclass_key *key;
12941         struct hlist_head *hash_head;
12942         struct lock_class *class;
12943 +       bool is_static = false;
12945         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
12946                 debug_locks_off();
12947 @@ -671,10 +672,23 @@
12949         /*
12950          * Static locks do not have their class-keys yet - for them the key
12951 -        * is the lock object itself:
12952 -        */
12953 -       if (unlikely(!lock->key))
12954 -               lock->key = (void *)lock;
12955 +        * is the lock object itself. If the lock is in the per cpu area,
12956 +        * the canonical address of the lock (per cpu offset removed) is
12957 +        * used.
12958 +        */
12959 +       if (unlikely(!lock->key)) {
12960 +               unsigned long can_addr, addr = (unsigned long)lock;
12962 +               if (__is_kernel_percpu_address(addr, &can_addr))
12963 +                       lock->key = (void *)can_addr;
12964 +               else if (__is_module_percpu_address(addr, &can_addr))
12965 +                       lock->key = (void *)can_addr;
12966 +               else if (static_obj(lock))
12967 +                       lock->key = (void *)lock;
12968 +               else
12969 +                       return ERR_PTR(-EINVAL);
12970 +               is_static = true;
12971 +       }
12973         /*
12974          * NOTE: the class-key must be unique. For dynamic locks, a static
12975 @@ -706,7 +720,7 @@
12976                 }
12977         }
12979 -       return NULL;
12980 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
12983  /*
12984 @@ -724,19 +738,18 @@
12985         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
12987         class = look_up_lock_class(lock, subclass);
12988 -       if (likely(class))
12989 +       if (likely(!IS_ERR_OR_NULL(class)))
12990                 goto out_set_class_cache;
12992         /*
12993          * Debug-check: all keys must be persistent!
12994 -        */
12995 -       if (!static_obj(lock->key)) {
12996 +        */
12997 +       if (IS_ERR(class)) {
12998                 debug_locks_off();
12999                 printk("INFO: trying to register non-static key.\n");
13000                 printk("the code is fine but needs lockdep annotation.\n");
13001                 printk("turning off the locking correctness validator.\n");
13002                 dump_stack();
13004                 return NULL;
13005         }
13007 @@ -3410,7 +3423,7 @@
13008                  * Clearly if the lock hasn't been acquired _ever_, we're not
13009                  * holding it either, so report failure.
13010                  */
13011 -               if (!class)
13012 +               if (IS_ERR_OR_NULL(class))
13013                         return 0;
13015                 /*
13016 @@ -3689,6 +3702,7 @@
13017                 }
13018         }
13020 +#ifndef CONFIG_PREEMPT_RT_FULL
13021         /*
13022          * We dont accurately track softirq state in e.g.
13023          * hardirq contexts (such as on 4KSTACKS), so only
13024 @@ -3703,6 +3717,7 @@
13025                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13026                 }
13027         }
13028 +#endif
13030         if (!debug_locks)
13031                 print_irqtrace_events(current);
13032 @@ -4159,7 +4174,7 @@
13033                  * If the class exists we look it up and zap it:
13034                  */
13035                 class = look_up_lock_class(lock, j);
13036 -               if (class)
13037 +               if (!IS_ERR_OR_NULL(class))
13038                         zap_class(class);
13039         }
13040         /*
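
The lockdep change above lets statically allocated per-cpu locks share one lock class: when a lock has no explicit key, its canonical address (the per-cpu offset stripped) is used as the key, so every CPU's copy of the same per-cpu lock maps to the same class instead of registering a new one per CPU. The user-space model below mimics only the address canonicalization; NCPUS, PCPU_SIZE and canonical_addr() are invented names.

    #include <stdint.h>
    #include <stdio.h>

    #define NCPUS     4
    #define PCPU_SIZE 4096

    /* Model of the per-cpu area: one PCPU_SIZE chunk per cpu. */
    static unsigned char percpu_base[NCPUS][PCPU_SIZE];

    /*
     * If @addr lies inside some cpu's chunk, return its canonical address:
     * the same object yields the same value on every cpu, so it can serve
     * as a shared class key.
     */
    static int canonical_addr(uintptr_t addr, uintptr_t *can)
    {
            int cpu;

            for (cpu = 0; cpu < NCPUS; cpu++) {
                    uintptr_t start = (uintptr_t)percpu_base[cpu];

                    if (addr >= start && addr < start + PCPU_SIZE) {
                            *can = addr - start; /* strip the per-cpu offset */
                            return 1;
                    }
            }
            return 0;
    }

    int main(void)
    {
            uintptr_t can0 = 0, can1 = 0;

            /* The "same" per-cpu object at offset 128 on two different cpus. */
            canonical_addr((uintptr_t)&percpu_base[0][128], &can0);
            canonical_addr((uintptr_t)&percpu_base[2][128], &can1);
            printf("same class key: %s\n", can0 == can1 ? "yes" : "no");
            return 0;
    }
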
13041 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/locktorture.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/locktorture.c
13042 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/locktorture.c    2017-04-16 10:38:29.000000000 +0200
13043 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/locktorture.c 2017-04-18 17:54:26.000000000 +0200
13044 @@ -26,7 +26,6 @@
13045  #include <linux/kthread.h>
13046  #include <linux/sched/rt.h>
13047  #include <linux/spinlock.h>
13048 -#include <linux/rwlock.h>
13049  #include <linux/mutex.h>
13050  #include <linux/rwsem.h>
13051  #include <linux/smp.h>
13052 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/percpu-rwsem.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/percpu-rwsem.c
13053 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/percpu-rwsem.c   2017-04-16 10:38:29.000000000 +0200
13054 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/percpu-rwsem.c        2017-04-18 17:54:26.000000000 +0200
13055 @@ -18,7 +18,7 @@
13056         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
13057         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
13058         __init_rwsem(&sem->rw_sem, name, rwsem_key);
13059 -       init_waitqueue_head(&sem->writer);
13060 +       init_swait_queue_head(&sem->writer);
13061         sem->readers_block = 0;
13062         return 0;
13064 @@ -103,7 +103,7 @@
13065         __this_cpu_dec(*sem->read_count);
13067         /* Prod writer to recheck readers_active */
13068 -       wake_up(&sem->writer);
13069 +       swake_up(&sem->writer);
13071  EXPORT_SYMBOL_GPL(__percpu_up_read);
13073 @@ -160,7 +160,7 @@
13074          */
13076         /* Wait for all now active readers to complete. */
13077 -       wait_event(sem->writer, readers_active_check(sem));
13078 +       swait_event(sem->writer, readers_active_check(sem));
13080  EXPORT_SYMBOL_GPL(percpu_down_write);
13082 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rt.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rt.c
13083 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rt.c     1970-01-01 01:00:00.000000000 +0100
13084 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rt.c  2017-04-18 17:54:26.000000000 +0200
13085 @@ -0,0 +1,331 @@
13087 + * kernel/rt.c
13088 + *
13089 + * Real-Time Preemption Support
13090 + *
13091 + * started by Ingo Molnar:
13092 + *
13093 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
13094 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13095 + *
13096 + * historic credit for proving that Linux spinlocks can be implemented via
13097 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
13098 + * and others) who prototyped it on 2.4 and did lots of comparative
13099 + * research and analysis; TimeSys, for proving that you can implement a
13100 + * fully preemptible kernel via the use of IRQ threading and mutexes;
13101 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
13102 + * right one; and to MontaVista, who ported pmutexes to 2.6.
13103 + *
13104 + * This code is a from-scratch implementation and is not based on pmutexes,
13105 + * but the idea of converting spinlocks to mutexes is used here too.
13106 + *
13107 + * lock debugging, locking tree, deadlock detection:
13108 + *
13109 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13110 + *  Released under the General Public License (GPL).
13111 + *
13112 + * Includes portions of the generic R/W semaphore implementation from:
13113 + *
13114 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
13115 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
13116 + *  - Derived also from comments by Linus
13117 + *
13118 + * Pending ownership of locks and ownership stealing:
13119 + *
13120 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
13121 + *
13122 + *   (also by Steven Rostedt)
13123 + *    - Converted single pi_lock to individual task locks.
13124 + *
13125 + * By Esben Nielsen:
13126 + *    Doing priority inheritance with help of the scheduler.
13127 + *
13128 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13129 + *  - major rework based on Esben Nielsens initial patch
13130 + *  - replaced thread_info references by task_struct refs
13131 + *  - removed task->pending_owner dependency
13132 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
13133 + *    in the scheduler return path as discussed with Steven Rostedt
13134 + *
13135 + *  Copyright (C) 2006, Kihon Technologies Inc.
13136 + *    Steven Rostedt <rostedt@goodmis.org>
13137 + *  - debugged and patched Thomas Gleixner's rework.
13138 + *  - added back the cmpxchg to the rework.
13139 + *  - turned atomic require back on for SMP.
13140 + */
13142 +#include <linux/spinlock.h>
13143 +#include <linux/rtmutex.h>
13144 +#include <linux/sched.h>
13145 +#include <linux/delay.h>
13146 +#include <linux/module.h>
13147 +#include <linux/kallsyms.h>
13148 +#include <linux/syscalls.h>
13149 +#include <linux/interrupt.h>
13150 +#include <linux/plist.h>
13151 +#include <linux/fs.h>
13152 +#include <linux/futex.h>
13153 +#include <linux/hrtimer.h>
13155 +#include "rtmutex_common.h"
13158 + * struct mutex functions
13159 + */
13160 +void __mutex_do_init(struct mutex *mutex, const char *name,
13161 +                    struct lock_class_key *key)
13163 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13164 +       /*
13165 +        * Make sure we are not reinitializing a held lock:
13166 +        */
13167 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
13168 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
13169 +#endif
13170 +       mutex->lock.save_state = 0;
13172 +EXPORT_SYMBOL(__mutex_do_init);
13174 +void __lockfunc _mutex_lock(struct mutex *lock)
13176 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13177 +       rt_mutex_lock(&lock->lock);
13179 +EXPORT_SYMBOL(_mutex_lock);
13181 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
13183 +       int ret;
13185 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13186 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13187 +       if (ret)
13188 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13189 +       return ret;
13191 +EXPORT_SYMBOL(_mutex_lock_interruptible);
13193 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
13195 +       int ret;
13197 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13198 +       ret = rt_mutex_lock_killable(&lock->lock);
13199 +       if (ret)
13200 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13201 +       return ret;
13203 +EXPORT_SYMBOL(_mutex_lock_killable);
13205 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13206 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
13208 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13209 +       rt_mutex_lock(&lock->lock);
13211 +EXPORT_SYMBOL(_mutex_lock_nested);
13213 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
13215 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
13216 +       rt_mutex_lock(&lock->lock);
13218 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
13220 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
13222 +       int ret;
13224 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13225 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13226 +       if (ret)
13227 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13228 +       return ret;
13230 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
13232 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
13234 +       int ret;
13236 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13237 +       ret = rt_mutex_lock_killable(&lock->lock);
13238 +       if (ret)
13239 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13240 +       return ret;
13242 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
13243 +#endif
13245 +int __lockfunc _mutex_trylock(struct mutex *lock)
13247 +       int ret = rt_mutex_trylock(&lock->lock);
13249 +       if (ret)
13250 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13252 +       return ret;
13254 +EXPORT_SYMBOL(_mutex_trylock);
13256 +void __lockfunc _mutex_unlock(struct mutex *lock)
13258 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
13259 +       rt_mutex_unlock(&lock->lock);
13261 +EXPORT_SYMBOL(_mutex_unlock);
13264 + * rwlock_t functions
13265 + */
13266 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
13268 +       int ret;
13270 +       migrate_disable();
13271 +       ret = rt_mutex_trylock(&rwlock->lock);
13272 +       if (ret)
13273 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13274 +       else
13275 +               migrate_enable();
13277 +       return ret;
13279 +EXPORT_SYMBOL(rt_write_trylock);
13281 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
13283 +       int ret;
13285 +       *flags = 0;
13286 +       ret = rt_write_trylock(rwlock);
13287 +       return ret;
13289 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
13291 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
13293 +       struct rt_mutex *lock = &rwlock->lock;
13294 +       int ret = 1;
13296 +       /*
13297 +        * recursive read locks succeed when current owns the lock,
13298 +        * but not when read_depth == 0 which means that the lock is
13299 +        * write locked.
13300 +        */
13301 +       if (rt_mutex_owner(lock) != current) {
13302 +               migrate_disable();
13303 +               ret = rt_mutex_trylock(lock);
13304 +               if (ret)
13305 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13306 +               else
13307 +                       migrate_enable();
13309 +       } else if (!rwlock->read_depth) {
13310 +               ret = 0;
13311 +       }
13313 +       if (ret)
13314 +               rwlock->read_depth++;
13316 +       return ret;
13318 +EXPORT_SYMBOL(rt_read_trylock);
13320 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
13322 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13323 +       __rt_spin_lock(&rwlock->lock);
13325 +EXPORT_SYMBOL(rt_write_lock);
13327 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
13329 +       struct rt_mutex *lock = &rwlock->lock;
13332 +       /*
13333 +        * recursive read locks succeed when current owns the lock
13334 +        */
13335 +       if (rt_mutex_owner(lock) != current) {
13336 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13337 +               __rt_spin_lock(lock);
13338 +       }
13339 +       rwlock->read_depth++;
13342 +EXPORT_SYMBOL(rt_read_lock);
13344 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
13346 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13347 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13348 +       __rt_spin_unlock(&rwlock->lock);
13349 +       migrate_enable();
13351 +EXPORT_SYMBOL(rt_write_unlock);
13353 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
13355 +       /* Release the lock only when read_depth is down to 0 */
13356 +       if (--rwlock->read_depth == 0) {
13357 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13358 +               __rt_spin_unlock(&rwlock->lock);
13359 +               migrate_enable();
13360 +       }
13362 +EXPORT_SYMBOL(rt_read_unlock);
13364 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
13366 +       rt_write_lock(rwlock);
13368 +       return 0;
13370 +EXPORT_SYMBOL(rt_write_lock_irqsave);
13372 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
13374 +       rt_read_lock(rwlock);
13376 +       return 0;
13378 +EXPORT_SYMBOL(rt_read_lock_irqsave);
13380 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
13382 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13383 +       /*
13384 +        * Make sure we are not reinitializing a held lock:
13385 +        */
13386 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
13387 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
13388 +#endif
13389 +       rwlock->lock.save_state = 1;
13390 +       rwlock->read_depth = 0;
13392 +EXPORT_SYMBOL(__rt_rwlock_init);
13394 +/**
13395 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
13396 + * @cnt: the atomic which we are to dec
13397 + * @lock: the mutex to return holding if we dec to 0
13398 + *
13399 + * return true and hold lock if we dec to 0, return false otherwise
13400 + */
13401 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
13403 +       /* dec if we can't possibly hit 0 */
13404 +       if (atomic_add_unless(cnt, -1, 1))
13405 +               return 0;
13406 +       /* we might hit 0, so take the lock */
13407 +       mutex_lock(lock);
13408 +       if (!atomic_dec_and_test(cnt)) {
13409 +               /* when we actually did the dec, we didn't hit 0 */
13410 +               mutex_unlock(lock);
13411 +               return 0;
13412 +       }
13413 +       /* we hit 0, and we hold the lock */
13414 +       return 1;
13416 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
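
kernel/locking/rt.c above closes with atomic_dec_and_mutex_lock(), which avoids taking the mutex unless the counter can actually reach zero; callers typically tear down a shared object only on that final decrement. Below is a minimal user-space analogue of the same dec-and-lock pattern using C11 atomics and a pthread mutex; struct obj, obj_put() and the printf() are invented for illustration.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
            atomic_int      refs;
            pthread_mutex_t lock;   /* protects the teardown path */
    };

    /* The lock is only taken when the count might actually drop to zero. */
    static void obj_put(struct obj *o)
    {
            int old = atomic_load(&o->refs);

            /* Fast path: decrement while we cannot possibly hit zero. */
            while (old > 1) {
                    if (atomic_compare_exchange_weak(&o->refs, &old, old - 1))
                            return;
            }

            /* Slow path: we might hit zero, so take the lock first. */
            pthread_mutex_lock(&o->lock);
            if (atomic_fetch_sub(&o->refs, 1) == 1) {
                    pthread_mutex_unlock(&o->lock);
                    printf("last reference dropped, freeing\n");
                    pthread_mutex_destroy(&o->lock);
                    free(o);
                    return;
            }
            pthread_mutex_unlock(&o->lock);
    }

    int main(void)
    {
            struct obj *o = malloc(sizeof(*o));

            atomic_init(&o->refs, 2);
            pthread_mutex_init(&o->lock, NULL);

            obj_put(o);     /* fast path, no lock taken */
            obj_put(o);     /* slow path, frees the object */
            return 0;
    }
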
13417 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex-debug.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex-debug.c
13418 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex-debug.c  2017-04-16 10:38:29.000000000 +0200
13419 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex-debug.c       2017-04-18 17:54:26.000000000 +0200
13420 @@ -173,12 +173,3 @@
13421         lock->name = name;
13424 -void
13425 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
13429 -void rt_mutex_deadlock_account_unlock(struct task_struct *task)
13433 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex-debug.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex-debug.h
13434 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex-debug.h  2017-04-16 10:38:29.000000000 +0200
13435 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex-debug.h       2017-04-18 17:54:26.000000000 +0200
13436 @@ -9,9 +9,6 @@
13437   * This file contains macros used solely by rtmutex.c. Debug version.
13438   */
13440 -extern void
13441 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
13442 -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
13443  extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
13444  extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
13445  extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
13446 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex.c
13447 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex.c        2017-04-16 10:38:29.000000000 +0200
13448 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex.c     2017-04-18 17:54:26.000000000 +0200
13449 @@ -7,6 +7,11 @@
13450   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13451   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
13452   *  Copyright (C) 2006 Esben Nielsen
13453 + *  Adaptive Spinlocks:
13454 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
13455 + *                                  and Peter Morreale,
13456 + * Adaptive Spinlocks simplification:
13457 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
13458   *
13459   *  See Documentation/locking/rt-mutex-design.txt for details.
13460   */
13461 @@ -16,6 +21,7 @@
13462  #include <linux/sched/rt.h>
13463  #include <linux/sched/deadline.h>
13464  #include <linux/timer.h>
13465 +#include <linux/ww_mutex.h>
13467  #include "rtmutex_common.h"
13469 @@ -133,6 +139,12 @@
13470                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
13473 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
13475 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
13476 +               waiter != PI_REQUEUE_INPROGRESS;
13479  /*
13480   * We can speed up the acquire/release, if there's no debugging state to be
13481   * set up.
13482 @@ -414,6 +426,14 @@
13483         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
13486 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
13488 +       if (waiter->savestate)
13489 +               wake_up_lock_sleeper(waiter->task);
13490 +       else
13491 +               wake_up_process(waiter->task);
13494  /*
13495   * Max number of times we'll walk the boosting chain:
13496   */
13497 @@ -421,7 +441,8 @@
13499  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
13501 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
13502 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
13503 +               p->pi_blocked_on->lock : NULL;
13506  /*
13507 @@ -557,7 +578,7 @@
13508          * reached or the state of the chain has changed while we
13509          * dropped the locks.
13510          */
13511 -       if (!waiter)
13512 +       if (!rt_mutex_real_waiter(waiter))
13513                 goto out_unlock_pi;
13515         /*
13516 @@ -719,13 +740,16 @@
13517          * follow here. This is the end of the chain we are walking.
13518          */
13519         if (!rt_mutex_owner(lock)) {
13520 +               struct rt_mutex_waiter *lock_top_waiter;
13522                 /*
13523                  * If the requeue [7] above changed the top waiter,
13524                  * then we need to wake the new top waiter up to try
13525                  * to get the lock.
13526                  */
13527 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
13528 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
13529 +               lock_top_waiter = rt_mutex_top_waiter(lock);
13530 +               if (prerequeue_top_waiter != lock_top_waiter)
13531 +                       rt_mutex_wake_waiter(lock_top_waiter);
13532                 raw_spin_unlock_irq(&lock->wait_lock);
13533                 return 0;
13534         }
13535 @@ -818,6 +842,25 @@
13536         return ret;
13540 +#define STEAL_NORMAL  0
13541 +#define STEAL_LATERAL 1
13544 + * Note that RT tasks are excluded from lateral-steals to prevent the
13545 + * introduction of an unbounded latency
13546 + */
13547 +static inline int lock_is_stealable(struct task_struct *task,
13548 +                                   struct task_struct *pendowner, int mode)
13550 +    if (mode == STEAL_NORMAL || rt_task(task)) {
13551 +           if (task->prio >= pendowner->prio)
13552 +                   return 0;
13553 +    } else if (task->prio > pendowner->prio)
13554 +           return 0;
13555 +    return 1;
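/*
 * Worked example of the check above (hypothetical tasks; kernel view, where
 * a lower ->prio value means a higher priority): with a pending owner at
 * ->prio 120, a SCHED_OTHER contender also at ->prio 120 may take the lock
 * under STEAL_LATERAL but not under STEAL_NORMAL, while an RT contender must
 * always have a strictly higher priority (numerically lower ->prio) than the
 * pending owner, in either mode.
 */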
13558  /*
13559   * Try to take an rt-mutex
13560   *
13561 @@ -828,8 +871,9 @@
13562   * @waiter: The waiter that is queued to the lock's wait tree if the
13563   *         callsite called task_blocked_on_lock(), otherwise NULL
13564   */
13565 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13566 -                               struct rt_mutex_waiter *waiter)
13567 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
13568 +                                 struct task_struct *task,
13569 +                                 struct rt_mutex_waiter *waiter, int mode)
13571         /*
13572          * Before testing whether we can acquire @lock, we set the
13573 @@ -866,8 +910,10 @@
13574                  * If waiter is not the highest priority waiter of
13575                  * @lock, give up.
13576                  */
13577 -               if (waiter != rt_mutex_top_waiter(lock))
13578 +               if (waiter != rt_mutex_top_waiter(lock)) {
13579 +                       /* XXX lock_is_stealable() ? */
13580                         return 0;
13581 +               }
13583                 /*
13584                  * We can acquire the lock. Remove the waiter from the
13585 @@ -885,14 +931,10 @@
13586                  * not need to be dequeued.
13587                  */
13588                 if (rt_mutex_has_waiters(lock)) {
13589 -                       /*
13590 -                        * If @task->prio is greater than or equal to
13591 -                        * the top waiter priority (kernel view),
13592 -                        * @task lost.
13593 -                        */
13594 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
13595 -                               return 0;
13596 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
13598 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
13599 +                               return 0;
13600                         /*
13601                          * The current top waiter stays enqueued. We
13602                          * don't have to change anything in the lock
13603 @@ -936,10 +978,394 @@
13604          */
13605         rt_mutex_set_owner(lock, task);
13607 -       rt_mutex_deadlock_account_lock(lock, task);
13608 +       return 1;
13611 +#ifdef CONFIG_PREEMPT_RT_FULL
13613 + * preemptible spin_lock functions:
13614 + */
13615 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
13616 +                                        void  (*slowfn)(struct rt_mutex *lock,
13617 +                                                        bool mg_off),
13618 +                                        bool do_mig_dis)
13620 +       might_sleep_no_state_check();
13622 +       if (do_mig_dis)
13623 +               migrate_disable();
13625 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
13626 +               return;
13627 +       else
13628 +               slowfn(lock, do_mig_dis);
13631 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
13632 +                                          void  (*slowfn)(struct rt_mutex *lock))
13634 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
13635 +               return;
13636 +       else
13637 +               slowfn(lock);
13639 +#ifdef CONFIG_SMP
13641 + * Note that owner is a speculative pointer and dereferencing relies
13642 + * on rcu_read_lock() and the check against the lock owner.
13643 + */
13644 +static int adaptive_wait(struct rt_mutex *lock,
13645 +                        struct task_struct *owner)
13647 +       int res = 0;
13649 +       rcu_read_lock();
13650 +       for (;;) {
13651 +               if (owner != rt_mutex_owner(lock))
13652 +                       break;
13653 +               /*
13654 +                * Ensure that owner->on_cpu is dereferenced _after_
13655 +                * checking the above to be valid.
13656 +                */
13657 +               barrier();
13658 +               if (!owner->on_cpu) {
13659 +                       res = 1;
13660 +                       break;
13661 +               }
13662 +               cpu_relax();
13663 +       }
13664 +       rcu_read_unlock();
13665 +       return res;
13667 +#else
13668 +static int adaptive_wait(struct rt_mutex *lock,
13669 +                        struct task_struct *orig_owner)
13671         return 1;
13673 +#endif
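/*
 * Net effect: the waiter in rt_spin_lock_slowlock() below spins only while
 * the lock owner is running on another CPU. adaptive_wait() returns 1 once
 * the owner goes off-CPU (the waiter then schedules out) and 0 when the lock
 * changed hands (the waiter retries the acquisition immediately). On !SMP it
 * always returns 1, i.e. the waiter always sleeps.
 */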
13675 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13676 +                                  struct rt_mutex_waiter *waiter,
13677 +                                  struct task_struct *task,
13678 +                                  enum rtmutex_chainwalk chwalk);
13680 + * Slow path lock function spin_lock style: this variant is very
13681 + * careful not to miss any non-lock wakeups.
13682 + *
13683 + * We store the current state under p->pi_lock in p->saved_state and
13684 + * the try_to_wake_up() code handles this accordingly.
13685 + */
13686 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
13687 +                                                   bool mg_off)
13689 +       struct task_struct *lock_owner, *self = current;
13690 +       struct rt_mutex_waiter waiter, *top_waiter;
13691 +       unsigned long flags;
13692 +       int ret;
13694 +       rt_mutex_init_waiter(&waiter, true);
13696 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13698 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
13699 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13700 +               return;
13701 +       }
13703 +       BUG_ON(rt_mutex_owner(lock) == self);
13705 +       /*
13706 +        * We save whatever state the task is in and we'll restore it
13707 +        * after acquiring the lock taking real wakeups into account
13708 +        * as well. We are serialized via pi_lock against wakeups. See
13709 +        * try_to_wake_up().
13710 +        */
13711 +       raw_spin_lock(&self->pi_lock);
13712 +       self->saved_state = self->state;
13713 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
13714 +       raw_spin_unlock(&self->pi_lock);
13716 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
13717 +       BUG_ON(ret);
13719 +       for (;;) {
13720 +               /* Try to acquire the lock again. */
13721 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
13722 +                       break;
13724 +               top_waiter = rt_mutex_top_waiter(lock);
13725 +               lock_owner = rt_mutex_owner(lock);
13727 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13729 +               debug_rt_mutex_print_deadlock(&waiter);
13731 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
13732 +                       if (mg_off)
13733 +                               migrate_enable();
13734 +                       schedule();
13735 +                       if (mg_off)
13736 +                               migrate_disable();
13737 +               }
13739 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
13741 +               raw_spin_lock(&self->pi_lock);
13742 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
13743 +               raw_spin_unlock(&self->pi_lock);
13744 +       }
13746 +       /*
13747 +        * Restore the task state to current->saved_state. We set it
13748 +        * to the original state above and the try_to_wake_up() code
13749 +        * has possibly updated it when a real (non-rtmutex) wakeup
13750 +        * happened while we were blocked. Clear saved_state so
13751 +        * try_to_wakeup() does not get confused.
13752 +        */
13753 +       raw_spin_lock(&self->pi_lock);
13754 +       __set_current_state_no_track(self->saved_state);
13755 +       self->saved_state = TASK_RUNNING;
13756 +       raw_spin_unlock(&self->pi_lock);
13758 +       /*
13759 +        * try_to_take_rt_mutex() sets the waiter bit
13760 +        * unconditionally. We might have to fix that up:
13761 +        */
13762 +       fixup_rt_mutex_waiters(lock);
13764 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
13765 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
13767 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13769 +       debug_rt_mutex_free_waiter(&waiter);
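/*
 * Worked example of why the saved_state dance above matters (hypothetical
 * scenario): a task in wait_event_interruptible() is in TASK_INTERRUPTIBLE
 * when it contends on a spinlock_t-turned-rtmutex. The slowlock parks it in
 * TASK_UNINTERRUPTIBLE, but the original state is kept in ->saved_state, so
 * a real wakeup (e.g. a signal) arriving meanwhile is folded into
 * ->saved_state by try_to_wake_up() and is not lost: once the lock is taken
 * and ->saved_state is restored, the interruptible wait still sees it.
 */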
13772 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13773 +                                   struct wake_q_head *wake_sleeper_q,
13774 +                                   struct rt_mutex *lock);
13776 + * Slow path to release a rt_mutex spin_lock style
13777 + */
13778 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
13780 +       unsigned long flags;
13781 +       WAKE_Q(wake_q);
13782 +       WAKE_Q(wake_sleeper_q);
13784 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13786 +       debug_rt_mutex_unlock(lock);
13788 +       if (!rt_mutex_has_waiters(lock)) {
13789 +               lock->owner = NULL;
13790 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13791 +               return;
13792 +       }
13794 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
13796 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13797 +       wake_up_q(&wake_q);
13798 +       wake_up_q_sleeper(&wake_sleeper_q);
13800 +       /* Undo pi boosting when necessary */
13801 +       rt_mutex_adjust_prio(current);
13804 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
13806 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
13807 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13809 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
13811 +void __lockfunc rt_spin_lock(spinlock_t *lock)
13813 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
13814 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13816 +EXPORT_SYMBOL(rt_spin_lock);
13818 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
13820 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
13822 +EXPORT_SYMBOL(__rt_spin_lock);
13824 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
13826 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
13828 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
13830 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13831 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
13833 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13834 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
13836 +EXPORT_SYMBOL(rt_spin_lock_nested);
13837 +#endif
13839 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
13841 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13842 +       spin_release(&lock->dep_map, 1, _RET_IP_);
13843 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
13845 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
13847 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
13849 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13850 +       spin_release(&lock->dep_map, 1, _RET_IP_);
13851 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
13852 +       migrate_enable();
13854 +EXPORT_SYMBOL(rt_spin_unlock);
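A hedged illustration of what the pair above provides, assuming the usual
PREEMPT_RT_FULL mapping of the spinlock_t API onto rt_spin_lock() and
rt_spin_unlock(): an ordinary spinlock_t section may now sleep under
contention, while migration stays disabled across it (all names below are
hypothetical).

static DEFINE_SPINLOCK(example_lock);
static unsigned long example_count;

static void example_inc(void)
{
        spin_lock(&example_lock);       /* rt_spin_lock(): may sleep, migration disabled */
        example_count++;
        spin_unlock(&example_lock);     /* rt_spin_unlock(): migration re-enabled */
}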
13856 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
13858 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
13860 +EXPORT_SYMBOL(__rt_spin_unlock);
13863 + * Wait for the lock to get unlocked: instead of polling for an unlock
13864 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
13865 + * schedule if there's contention:
13866 + */
13867 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
13869 +       spin_lock(lock);
13870 +       spin_unlock(lock);
13872 +EXPORT_SYMBOL(rt_spin_unlock_wait);
13874 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
13876 +       int ret;
13878 +       ret = rt_mutex_trylock(&lock->lock);
13879 +       if (ret)
13880 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13881 +       return ret;
13883 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
13885 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
13887 +       int ret;
13889 +       migrate_disable();
13890 +       ret = rt_mutex_trylock(&lock->lock);
13891 +       if (ret)
13892 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13893 +       else
13894 +               migrate_enable();
13895 +       return ret;
13897 +EXPORT_SYMBOL(rt_spin_trylock);
13899 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
13901 +       int ret;
13903 +       local_bh_disable();
13904 +       ret = rt_mutex_trylock(&lock->lock);
13905 +       if (ret) {
13906 +               migrate_disable();
13907 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13908 +       } else
13909 +               local_bh_enable();
13910 +       return ret;
13912 +EXPORT_SYMBOL(rt_spin_trylock_bh);
13914 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
13916 +       int ret;
13918 +       *flags = 0;
13919 +       ret = rt_mutex_trylock(&lock->lock);
13920 +       if (ret) {
13921 +               migrate_disable();
13922 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13923 +       }
13924 +       return ret;
13926 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
13928 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
13930 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
13931 +       if (atomic_add_unless(atomic, -1, 1))
13932 +               return 0;
13933 +       rt_spin_lock(lock);
13934 +       if (atomic_dec_and_test(atomic))
13935 +               return 1;
13936 +       rt_spin_unlock(lock);
13937 +       return 0;
13939 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
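A minimal usage sketch of atomic_dec_and_spin_lock() (the object type and
fields below are hypothetical): drop a reference and take the lock only when
the count would reach zero, mirroring the atomic_dec_and_lock() pattern; the
helper returns with the lock held in that case.

struct example_obj {
        atomic_t                refcount;
        spinlock_t              lock;
        struct list_head        node;
};

static void example_put(struct example_obj *obj)
{
        if (!atomic_dec_and_spin_lock(&obj->refcount, &obj->lock))
                return;                         /* not the last reference */
        list_del(&obj->node);
        spin_unlock(&obj->lock);                /* pairs with the lock taken by the helper */
        kfree(obj);
}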
13941 +void
13942 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
13944 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13945 +       /*
13946 +        * Make sure we are not reinitializing a held lock:
13947 +        */
13948 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
13949 +       lockdep_init_map(&lock->dep_map, name, key, 0);
13950 +#endif
13952 +EXPORT_SYMBOL(__rt_spin_lock_init);
13954 +#endif /* PREEMPT_RT_FULL */
13956 +#ifdef CONFIG_PREEMPT_RT_FULL
13957 +static inline int __sched
13958 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13960 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
13961 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
13963 +       if (!hold_ctx)
13964 +               return 0;
13966 +       if (unlikely(ctx == hold_ctx))
13967 +               return -EALREADY;
13969 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
13970 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
13971 +#ifdef CONFIG_DEBUG_MUTEXES
13972 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
13973 +               ctx->contending_lock = ww;
13974 +#endif
13975 +               return -EDEADLK;
13976 +       }
13978 +       return 0;
13980 +#else
13981 +static inline int __sched
13982 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13984 +       BUG();
13985 +       return 0;
13988 +#endif
13990 +static inline int
13991 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13992 +                    struct rt_mutex_waiter *waiter)
13994 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
13997  /*
13998   * Task blocks on lock.
13999 @@ -971,6 +1397,23 @@
14000                 return -EDEADLK;
14002         raw_spin_lock(&task->pi_lock);
14004 +       /*
14005 +        * In the case of futex requeue PI, this will be a proxy
14006 +        * lock. The task will wake unaware that it is enqueued on
14007 +        * this lock. Avoid blocking on two locks and corrupting
14008 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
14009 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
14010 +        * before requeue (due to a signal or timeout). Do not enqueue
14011 +        * the task if PI_WAKEUP_INPROGRESS is set.
14012 +        */
14013 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
14014 +               raw_spin_unlock(&task->pi_lock);
14015 +               return -EAGAIN;
14016 +       }
14018 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
14020         __rt_mutex_adjust_prio(task);
14021         waiter->task = task;
14022         waiter->lock = lock;
14023 @@ -994,7 +1437,7 @@
14024                 rt_mutex_enqueue_pi(owner, waiter);
14026                 __rt_mutex_adjust_prio(owner);
14027 -               if (owner->pi_blocked_on)
14028 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
14029                         chain_walk = 1;
14030         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14031                 chain_walk = 1;
14032 @@ -1036,6 +1479,7 @@
14033   * Called with lock->wait_lock held and interrupts disabled.
14034   */
14035  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14036 +                                   struct wake_q_head *wake_sleeper_q,
14037                                     struct rt_mutex *lock)
14039         struct rt_mutex_waiter *waiter;
14040 @@ -1064,7 +1508,10 @@
14042         raw_spin_unlock(&current->pi_lock);
14044 -       wake_q_add(wake_q, waiter->task);
14045 +       if (waiter->savestate)
14046 +               wake_q_add(wake_sleeper_q, waiter->task);
14047 +       else
14048 +               wake_q_add(wake_q, waiter->task);
14051  /*
14052 @@ -1078,7 +1525,7 @@
14054         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14055         struct task_struct *owner = rt_mutex_owner(lock);
14056 -       struct rt_mutex *next_lock;
14057 +       struct rt_mutex *next_lock = NULL;
14059         raw_spin_lock(&current->pi_lock);
14060         rt_mutex_dequeue(lock, waiter);
14061 @@ -1102,7 +1549,8 @@
14062         __rt_mutex_adjust_prio(owner);
14064         /* Store the lock on which owner is blocked or NULL */
14065 -       next_lock = task_blocked_on_lock(owner);
14066 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
14067 +               next_lock = task_blocked_on_lock(owner);
14069         raw_spin_unlock(&owner->pi_lock);
14071 @@ -1138,21 +1586,30 @@
14072         raw_spin_lock_irqsave(&task->pi_lock, flags);
14074         waiter = task->pi_blocked_on;
14075 -       if (!waiter || (waiter->prio == task->prio &&
14076 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
14077                         !dl_prio(task->prio))) {
14078                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14079                 return;
14080         }
14081         next_lock = waiter->lock;
14082 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14084         /* gets dropped in rt_mutex_adjust_prio_chain()! */
14085         get_task_struct(task);
14087 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14088         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
14089                                    next_lock, NULL, task);
14092 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
14094 +       debug_rt_mutex_init_waiter(waiter);
14095 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
14096 +       RB_CLEAR_NODE(&waiter->tree_entry);
14097 +       waiter->task = NULL;
14098 +       waiter->savestate = savestate;
14101  /**
14102   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
14103   * @lock:               the rt_mutex to take
14104 @@ -1166,7 +1623,8 @@
14105  static int __sched
14106  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14107                     struct hrtimer_sleeper *timeout,
14108 -                   struct rt_mutex_waiter *waiter)
14109 +                   struct rt_mutex_waiter *waiter,
14110 +                   struct ww_acquire_ctx *ww_ctx)
14112         int ret = 0;
14114 @@ -1175,16 +1633,17 @@
14115                 if (try_to_take_rt_mutex(lock, current, waiter))
14116                         break;
14118 -               /*
14119 -                * TASK_INTERRUPTIBLE checks for signals and
14120 -                * timeout. Ignored otherwise.
14121 -                */
14122 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
14123 -                       /* Signal pending? */
14124 -                       if (signal_pending(current))
14125 -                               ret = -EINTR;
14126 -                       if (timeout && !timeout->task)
14127 -                               ret = -ETIMEDOUT;
14128 +               if (timeout && !timeout->task) {
14129 +                       ret = -ETIMEDOUT;
14130 +                       break;
14131 +               }
14132 +               if (signal_pending_state(state, current)) {
14133 +                       ret = -EINTR;
14134 +                       break;
14135 +               }
14137 +               if (ww_ctx && ww_ctx->acquired > 0) {
14138 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
14139                         if (ret)
14140                                 break;
14141                 }
14142 @@ -1223,35 +1682,94 @@
14143         }
14147 - * Slow path lock function:
14148 - */
14149 -static int __sched
14150 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
14151 -                 struct hrtimer_sleeper *timeout,
14152 -                 enum rtmutex_chainwalk chwalk)
14153 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
14154 +                                                  struct ww_acquire_ctx *ww_ctx)
14156 -       struct rt_mutex_waiter waiter;
14157 -       unsigned long flags;
14158 -       int ret = 0;
14159 +#ifdef CONFIG_DEBUG_MUTEXES
14160 +       /*
14161 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
14162 +        * but released with a normal mutex_unlock in this call.
14163 +        *
14164 +        * This should never happen, always use ww_mutex_unlock.
14165 +        */
14166 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
14168 -       debug_rt_mutex_init_waiter(&waiter);
14169 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
14170 -       RB_CLEAR_NODE(&waiter.tree_entry);
14171 +       /*
14172 +        * Not quite done after calling ww_acquire_done() ?
14173 +        */
14174 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
14176 +       if (ww_ctx->contending_lock) {
14177 +               /*
14178 +                * After -EDEADLK you tried to
14179 +                * acquire a different ww_mutex? Bad!
14180 +                */
14181 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
14183 +               /*
14184 +                * You called ww_mutex_lock after receiving -EDEADLK,
14185 +                * but 'forgot' to unlock everything else first?
14186 +                */
14187 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
14188 +               ww_ctx->contending_lock = NULL;
14189 +       }
14191         /*
14192 -        * Technically we could use raw_spin_[un]lock_irq() here, but this can
14193 -        * be called in early boot if the cmpxchg() fast path is disabled
14194 -        * (debug, no architecture support). In this case we will acquire the
14195 -        * rtmutex with lock->wait_lock held. But we cannot unconditionally
14196 -        * enable interrupts in that early boot case. So we need to use the
14197 -        * irqsave/restore variants.
14198 +        * Naughty, using a different class will lead to undefined behavior!
14199          */
14200 -       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14201 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
14202 +#endif
14203 +       ww_ctx->acquired++;
14206 +#ifdef CONFIG_PREEMPT_RT_FULL
14207 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14208 +                                 struct ww_acquire_ctx *ww_ctx)
14210 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14211 +       struct rt_mutex_waiter *waiter, *n;
14213 +       /*
14214 +        * This branch gets optimized out for the common case,
14215 +        * and is only important for ww_mutex_lock.
14216 +        */
14217 +       ww_mutex_lock_acquired(ww, ww_ctx);
14218 +       ww->ctx = ww_ctx;
14220 +       /*
14221 +        * Give any possible sleeping processes the chance to wake up,
14222 +        * so they can recheck if they have to back off.
14223 +        */
14224 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
14225 +                                            tree_entry) {
14226 +               /* XXX debug rt mutex waiter wakeup */
14228 +               BUG_ON(waiter->lock != lock);
14229 +               rt_mutex_wake_waiter(waiter);
14230 +       }
14233 +#else
14235 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14236 +                                 struct ww_acquire_ctx *ww_ctx)
14238 +       BUG();
14240 +#endif
14242 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
14243 +                                    struct hrtimer_sleeper *timeout,
14244 +                                    enum rtmutex_chainwalk chwalk,
14245 +                                    struct ww_acquire_ctx *ww_ctx,
14246 +                                    struct rt_mutex_waiter *waiter)
14248 +       int ret;
14250         /* Try to acquire the lock again: */
14251         if (try_to_take_rt_mutex(lock, current, NULL)) {
14252 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14253 +               if (ww_ctx)
14254 +                       ww_mutex_account_lock(lock, ww_ctx);
14255                 return 0;
14256         }
14258 @@ -1261,17 +1779,27 @@
14259         if (unlikely(timeout))
14260                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
14262 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
14263 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
14265 -       if (likely(!ret))
14266 +       if (likely(!ret)) {
14267                 /* sleep on the mutex */
14268 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
14269 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
14270 +                                         ww_ctx);
14271 +       } else if (ww_ctx) {
14272 +               /* ww_mutex received EDEADLK, let it become EALREADY */
14273 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
14274 +               BUG_ON(!ret);
14275 +       }
14277         if (unlikely(ret)) {
14278                 __set_current_state(TASK_RUNNING);
14279                 if (rt_mutex_has_waiters(lock))
14280 -                       remove_waiter(lock, &waiter);
14281 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
14282 +                       remove_waiter(lock, waiter);
14283 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
14284 +               if (!ww_ctx)
14285 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
14286 +       } else if (ww_ctx) {
14287 +               ww_mutex_account_lock(lock, ww_ctx);
14288         }
14290         /*
14291 @@ -1279,6 +1807,36 @@
14292          * unconditionally. We might have to fix that up.
14293          */
14294         fixup_rt_mutex_waiters(lock);
14295 +       return ret;
14299 + * Slow path lock function:
14300 + */
14301 +static int __sched
14302 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
14303 +                 struct hrtimer_sleeper *timeout,
14304 +                 enum rtmutex_chainwalk chwalk,
14305 +                 struct ww_acquire_ctx *ww_ctx)
14307 +       struct rt_mutex_waiter waiter;
14308 +       unsigned long flags;
14309 +       int ret = 0;
14311 +       rt_mutex_init_waiter(&waiter, false);
14313 +       /*
14314 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
14315 +        * be called in early boot if the cmpxchg() fast path is disabled
14316 +        * (debug, no architecture support). In this case we will acquire the
14317 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
14318 +        * enable interrupts in that early boot case. So we need to use the
14319 +        * irqsave/restore variants.
14320 +        */
14321 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14323 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
14324 +                                      &waiter);
14326         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14328 @@ -1331,7 +1889,8 @@
14329   * Return whether the current task needs to undo a potential priority boosting.
14330   */
14331  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14332 -                                       struct wake_q_head *wake_q)
14333 +                                       struct wake_q_head *wake_q,
14334 +                                       struct wake_q_head *wake_sleeper_q)
14336         unsigned long flags;
14338 @@ -1340,8 +1899,6 @@
14340         debug_rt_mutex_unlock(lock);
14342 -       rt_mutex_deadlock_account_unlock(current);
14344         /*
14345          * We must be careful here if the fast path is enabled. If we
14346          * have no waiters queued we cannot set owner to NULL here
14347 @@ -1387,7 +1944,7 @@
14348          *
14349          * Queue the next waiter for wakeup once we release the wait_lock.
14350          */
14351 -       mark_wakeup_next_waiter(wake_q, lock);
14352 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
14354         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14356 @@ -1403,63 +1960,79 @@
14357   */
14358  static inline int
14359  rt_mutex_fastlock(struct rt_mutex *lock, int state,
14360 +                 struct ww_acquire_ctx *ww_ctx,
14361                   int (*slowfn)(struct rt_mutex *lock, int state,
14362                                 struct hrtimer_sleeper *timeout,
14363 -                               enum rtmutex_chainwalk chwalk))
14364 +                               enum rtmutex_chainwalk chwalk,
14365 +                               struct ww_acquire_ctx *ww_ctx))
14367 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14368 -               rt_mutex_deadlock_account_lock(lock, current);
14369 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14370                 return 0;
14371 -       } else
14372 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
14374 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
14377  static inline int
14378  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
14379                         struct hrtimer_sleeper *timeout,
14380                         enum rtmutex_chainwalk chwalk,
14381 +                       struct ww_acquire_ctx *ww_ctx,
14382                         int (*slowfn)(struct rt_mutex *lock, int state,
14383                                       struct hrtimer_sleeper *timeout,
14384 -                                     enum rtmutex_chainwalk chwalk))
14385 +                                     enum rtmutex_chainwalk chwalk,
14386 +                                     struct ww_acquire_ctx *ww_ctx))
14388         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
14389 -           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14390 -               rt_mutex_deadlock_account_lock(lock, current);
14391 +           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14392                 return 0;
14393 -       } else
14394 -               return slowfn(lock, state, timeout, chwalk);
14396 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
14399  static inline int
14400  rt_mutex_fasttrylock(struct rt_mutex *lock,
14401                      int (*slowfn)(struct rt_mutex *lock))
14403 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14404 -               rt_mutex_deadlock_account_lock(lock, current);
14405 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14406                 return 1;
14407 -       }
14409         return slowfn(lock);
14412  static inline void
14413  rt_mutex_fastunlock(struct rt_mutex *lock,
14414                     bool (*slowfn)(struct rt_mutex *lock,
14415 -                                  struct wake_q_head *wqh))
14416 +                                  struct wake_q_head *wqh,
14417 +                                  struct wake_q_head *wq_sleeper))
14419         WAKE_Q(wake_q);
14420 +       WAKE_Q(wake_sleeper_q);
14421 +       bool deboost;
14423 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
14424 -               rt_mutex_deadlock_account_unlock(current);
14425 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
14426 +               return;
14428 -       } else {
14429 -               bool deboost = slowfn(lock, &wake_q);
14430 +       deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
14432 -               wake_up_q(&wake_q);
14433 +       wake_up_q(&wake_q);
14434 +       wake_up_q_sleeper(&wake_sleeper_q);
14436 -               /* Undo pi boosting if necessary: */
14437 -               if (deboost)
14438 -                       rt_mutex_adjust_prio(current);
14439 -       }
14440 +       /* Undo pi boosting if necessary: */
14441 +       if (deboost)
14442 +               rt_mutex_adjust_prio(current);
14445 +/**
14446 + * rt_mutex_lock_state - lock a rt_mutex with a given state
14447 + *
14448 + * @lock:      The rt_mutex to be locked
14449 + * @state:     The state to set when blocking on the rt_mutex
14450 + */
14451 +int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
14453 +       might_sleep();
14455 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
14458  /**
14459 @@ -1469,15 +2042,13 @@
14460   */
14461  void __sched rt_mutex_lock(struct rt_mutex *lock)
14463 -       might_sleep();
14465 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
14466 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
14468  EXPORT_SYMBOL_GPL(rt_mutex_lock);
14470  /**
14471   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
14472 - *
14473 + **
14474   * @lock:              the rt_mutex to be locked
14475   *
14476   * Returns:
14477 @@ -1486,23 +2057,32 @@
14478   */
14479  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
14481 -       might_sleep();
14483 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
14484 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
14486  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
14489 - * Futex variant with full deadlock detection.
14490 +/**
14491 + * rt_mutex_lock_killable - lock a rt_mutex killable
14492 + *
14493 + * @lock:              the rt_mutex to be locked
14495 + *
14496 + * Returns:
14497 + *  0          on success
14498 + * -EINTR      when interrupted by a signal
14499   */
14500 -int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
14501 -                             struct hrtimer_sleeper *timeout)
14502 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
14504 -       might_sleep();
14505 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
14507 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
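A short, hypothetical caller of rt_mutex_lock_killable() above: block until
the mutex is free, but back out if a fatal signal arrives while waiting.

static int example_op(struct rt_mutex *lock)
{
        int ret = rt_mutex_lock_killable(lock);

        if (ret)
                return ret;             /* -EINTR: killed while blocked */
        /* ... critical section ... */
        rt_mutex_unlock(lock);
        return 0;
}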
14509 -       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
14510 -                                      RT_MUTEX_FULL_CHAINWALK,
14511 -                                      rt_mutex_slowlock);
14513 + * Futex variant, must not use fastpath.
14514 + */
14515 +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
14517 +       return rt_mutex_slowtrylock(lock);
14520  /**
14521 @@ -1525,6 +2105,7 @@
14523         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
14524                                        RT_MUTEX_MIN_CHAINWALK,
14525 +                                      NULL,
14526                                        rt_mutex_slowlock);
14528  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
14529 @@ -1542,7 +2123,11 @@
14530   */
14531  int __sched rt_mutex_trylock(struct rt_mutex *lock)
14533 +#ifdef CONFIG_PREEMPT_RT_FULL
14534 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
14535 +#else
14536         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
14537 +#endif
14538                 return 0;
14540         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
14541 @@ -1561,20 +2146,41 @@
14542  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
14544  /**
14545 - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
14546 - * @lock: the rt_mutex to be unlocked
14547 - *
14548 - * Returns: true/false indicating whether priority adjustment is
14549 - * required or not.
14550 + * Futex variant, that since futex variants do not use the fast-path, can be
14551 + * simple and will not need to retry.
14552   */
14553 -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
14554 -                                  struct wake_q_head *wqh)
14555 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
14556 +                                   struct wake_q_head *wake_q,
14557 +                                   struct wake_q_head *wq_sleeper)
14559 +       lockdep_assert_held(&lock->wait_lock);
14561 +       debug_rt_mutex_unlock(lock);
14563 +       if (!rt_mutex_has_waiters(lock)) {
14564 +               lock->owner = NULL;
14565 +               return false; /* done */
14566 +       }
14568 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
14569 +       return true; /* deboost and wakeups */
14572 +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
14574 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
14575 -               rt_mutex_deadlock_account_unlock(current);
14576 -               return false;
14577 +       WAKE_Q(wake_q);
14578 +       WAKE_Q(wake_sleeper_q);
14579 +       bool deboost;
14581 +       raw_spin_lock_irq(&lock->wait_lock);
14582 +       deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
14583 +       raw_spin_unlock_irq(&lock->wait_lock);
14585 +       if (deboost) {
14586 +               wake_up_q(&wake_q);
14587 +               wake_up_q_sleeper(&wake_sleeper_q);
14588 +               rt_mutex_adjust_prio(current);
14589         }
14590 -       return rt_mutex_slowunlock(lock, wqh);
14593  /**
14594 @@ -1607,13 +2213,12 @@
14595  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
14597         lock->owner = NULL;
14598 -       raw_spin_lock_init(&lock->wait_lock);
14599         lock->waiters = RB_ROOT;
14600         lock->waiters_leftmost = NULL;
14602         debug_rt_mutex_init(lock, name);
14604 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
14605 +EXPORT_SYMBOL(__rt_mutex_init);
14607  /**
14608   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
14609 @@ -1628,10 +2233,9 @@
14610  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
14611                                 struct task_struct *proxy_owner)
14613 -       __rt_mutex_init(lock, NULL);
14614 +       rt_mutex_init(lock);
14615         debug_rt_mutex_proxy_lock(lock, proxy_owner);
14616         rt_mutex_set_owner(lock, proxy_owner);
14617 -       rt_mutex_deadlock_account_lock(lock, proxy_owner);
14620  /**
14621 @@ -1647,34 +2251,45 @@
14623         debug_rt_mutex_proxy_unlock(lock);
14624         rt_mutex_set_owner(lock, NULL);
14625 -       rt_mutex_deadlock_account_unlock(proxy_owner);
14628 -/**
14629 - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
14630 - * @lock:              the rt_mutex to take
14631 - * @waiter:            the pre-initialized rt_mutex_waiter
14632 - * @task:              the task to prepare
14633 - *
14634 - * Returns:
14635 - *  0 - task blocked on lock
14636 - *  1 - acquired the lock for task, caller should wake it up
14637 - * <0 - error
14638 - *
14639 - * Special API call for FUTEX_REQUEUE_PI support.
14640 - */
14641 -int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14642 +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14643                               struct rt_mutex_waiter *waiter,
14644                               struct task_struct *task)
14646         int ret;
14648 -       raw_spin_lock_irq(&lock->wait_lock);
14649 +       if (try_to_take_rt_mutex(lock, task, NULL))
14650 +               return 1;
14652 -       if (try_to_take_rt_mutex(lock, task, NULL)) {
14653 +#ifdef CONFIG_PREEMPT_RT_FULL
14654 +       /*
14655 +        * In PREEMPT_RT there's an added race.
14656 +        * If the task, that we are about to requeue, times out,
14657 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
14658 +        * to skip this task. But right after the task sets
14659 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
14660 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
14661 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
14662 +        * lock that it blocks on. We *must not* place this task
14663 +        * on this proxy lock in that case.
14664 +        *
14665 +        * To prevent this race, we first take the task's pi_lock
14666 +        * and check if it has updated its pi_blocked_on. If it has,
14667 +        * we assume that it woke up and we return -EAGAIN.
14668 +        * Otherwise, we set the task's pi_blocked_on to
14669 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
14670 +        * it will know that we are in the process of requeuing it.
14671 +        */
14672 +       raw_spin_lock(&task->pi_lock);
14673 +       if (task->pi_blocked_on) {
14674 +               raw_spin_unlock(&task->pi_lock);
14675                 raw_spin_unlock_irq(&lock->wait_lock);
14676 -               return 1;
14677 +               return -EAGAIN;
14678         }
14679 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
14680 +       raw_spin_unlock(&task->pi_lock);
14681 +#endif
14683         /* We enforce deadlock detection for futexes */
14684         ret = task_blocks_on_rt_mutex(lock, waiter, task,
14685 @@ -1690,17 +2305,41 @@
14686                 ret = 0;
14687         }
14689 -       if (unlikely(ret))
14690 +       if (ret && rt_mutex_has_waiters(lock))
14691                 remove_waiter(lock, waiter);
14693 -       raw_spin_unlock_irq(&lock->wait_lock);
14695         debug_rt_mutex_print_deadlock(waiter);
14697         return ret;
14700  /**
14701 + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
14702 + * @lock:              the rt_mutex to take
14703 + * @waiter:            the pre-initialized rt_mutex_waiter
14704 + * @task:              the task to prepare
14705 + *
14706 + * Returns:
14707 + *  0 - task blocked on lock
14708 + *  1 - acquired the lock for task, caller should wake it up
14709 + * <0 - error
14710 + *
14711 + * Special API call for FUTEX_REQUEUE_PI support.
14712 + */
14713 +int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14714 +                             struct rt_mutex_waiter *waiter,
14715 +                             struct task_struct *task)
14717 +       int ret;
14719 +       raw_spin_lock_irq(&lock->wait_lock);
14720 +       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
14721 +       raw_spin_unlock_irq(&lock->wait_lock);
14723 +       return ret;
14726 +/**
14727   * rt_mutex_next_owner - return the next owner of the lock
14728   *
14729   * @lock: the rt lock query
14730 @@ -1721,21 +2360,23 @@
14733  /**
14734 - * rt_mutex_finish_proxy_lock() - Complete lock acquisition
14735 + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
14736   * @lock:              the rt_mutex we were woken on
14737   * @to:                        the timeout, null if none. hrtimer should already have
14738   *                     been started.
14739   * @waiter:            the pre-initialized rt_mutex_waiter
14740   *
14741 - * Complete the lock acquisition started our behalf by another thread.
14742 + * Wait for the lock acquisition started on our behalf by
14743 + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
14744 + * rt_mutex_cleanup_proxy_lock().
14745   *
14746   * Returns:
14747   *  0 - success
14748   * <0 - error, one of -EINTR, -ETIMEDOUT
14749   *
14750 - * Special API call for PI-futex requeue support
14751 + * Special API call for PI-futex support
14752   */
14753 -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
14754 +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
14755                                struct hrtimer_sleeper *to,
14756                                struct rt_mutex_waiter *waiter)
14758 @@ -1746,10 +2387,47 @@
14759         set_current_state(TASK_INTERRUPTIBLE);
14761         /* sleep on the mutex */
14762 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
14763 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
14765 -       if (unlikely(ret))
14766 +       raw_spin_unlock_irq(&lock->wait_lock);
14768 +       return ret;
14771 +/**
14772 + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
14773 + * @lock:              the rt_mutex we were woken on
14774 + * @waiter:            the pre-initialized rt_mutex_waiter
14775 + *
14776 + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
14777 + *
14778 + * Unless we acquired the lock, we're still enqueued on the wait-list and can
14779 + * in fact still be granted ownership until we're removed. Therefore we can
14780 + * find we are in fact the owner and must disregard the
14781 + * rt_mutex_wait_proxy_lock() failure.
14782 + *
14783 + * Returns:
14784 + *  true  - we did the cleanup and are done.
14785 + *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
14786 + *          so the caller should disregard its return value.
14787 + *
14788 + * Special API call for PI-futex support
14789 + */
14790 +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
14791 +                                struct rt_mutex_waiter *waiter)
14793 +       bool cleanup = false;
14795 +       raw_spin_lock_irq(&lock->wait_lock);
14796 +       /*
14797 +        * Unless we're the owner, we're still enqueued on the wait_list.
14798 +        * So check if we became owner, if not, take us off the wait_list.
14799 +        */
14800 +       if (rt_mutex_owner(lock) != current) {
14801                 remove_waiter(lock, waiter);
14802 +               fixup_rt_mutex_waiters(lock);
14803 +               cleanup = true;
14804 +       }
14806         /*
14807          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
14808 @@ -1759,5 +2437,91 @@
14810         raw_spin_unlock_irq(&lock->wait_lock);
14812 +       return cleanup;
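A hedged sketch of the intended calling sequence for the proxy API above,
loosely following the futex requeue-PI usage; 'lock', 'waiter', 'task' and
'to' stand in for the caller's own variables and all surrounding futex
details are omitted.

        /* In the requeueing task, acquiring 'lock' on behalf of the sleeping 'task': */
        rt_mutex_init_waiter(&waiter, false);
        ret = rt_mutex_start_proxy_lock(lock, &waiter, task);
        /* 1: lock taken for 'task' (wake it up), 0: 'task' is now a blocked waiter, <0: error */

        /* Later, in the woken waiter task itself: */
        ret = rt_mutex_wait_proxy_lock(lock, to, &waiter);
        if (ret && !rt_mutex_cleanup_proxy_lock(lock, &waiter))
                ret = 0;        /* we became the owner anyway, keep the lock */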
14815 +static inline int
14816 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
14818 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
14819 +       unsigned tmp;
14821 +       if (ctx->deadlock_inject_countdown-- == 0) {
14822 +               tmp = ctx->deadlock_inject_interval;
14823 +               if (tmp > UINT_MAX/4)
14824 +                       tmp = UINT_MAX;
14825 +               else
14826 +                       tmp = tmp*2 + tmp + tmp/2;
14828 +               ctx->deadlock_inject_interval = tmp;
14829 +               ctx->deadlock_inject_countdown = tmp;
14830 +               ctx->contending_lock = lock;
14832 +               ww_mutex_unlock(lock);
14834 +               return -EDEADLK;
14835 +       }
14836 +#endif
14838 +       return 0;
14841 +#ifdef CONFIG_PREEMPT_RT_FULL
14842 +int __sched
14843 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
14845 +       int ret;
14847 +       might_sleep();
14849 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
14850 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
14851 +       if (ret)
14852 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
14853 +       else if (!ret && ww_ctx->acquired > 1)
14854 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
14856         return ret;
14858 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
14860 +int __sched
14861 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
14863 +       int ret;
14865 +       might_sleep();
14867 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
14868 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
14869 +       if (ret)
14870 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
14871 +       else if (!ret && ww_ctx->acquired > 1)
14872 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
14874 +       return ret;
14876 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
14878 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
14880 +       int nest = !!lock->ctx;
14882 +       /*
14883 +        * The unlocking fastpath is the 0->1 transition from 'locked'
14884 +        * into 'unlocked' state:
14885 +        */
14886 +       if (nest) {
14887 +#ifdef CONFIG_DEBUG_MUTEXES
14888 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
14889 +#endif
14890 +               if (lock->ctx->acquired > 0)
14891 +                       lock->ctx->acquired--;
14892 +               lock->ctx = NULL;
14893 +       }
14895 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
14896 +       rt_mutex_unlock(&lock->base.lock);
14898 +EXPORT_SYMBOL(ww_mutex_unlock);
14899 +#endif
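For context, a hedged sketch of the wound/wait locking pattern the RT
ww_mutex implementation above has to keep working (the two objects and the
ww_class are hypothetical); a caller that gets -EDEADLK backs off exactly as
ww_mutex_deadlock_injection() simulates.

static DEFINE_WW_CLASS(example_ww_class);

static int example_lock_pair(struct ww_mutex *a, struct ww_mutex *b)
{
        struct ww_acquire_ctx ctx;
        int err;

        ww_acquire_init(&ctx, &example_ww_class);

        err = ww_mutex_lock(a, &ctx);
        if (!err)
                err = ww_mutex_lock(b, &ctx);
        if (err == -EDEADLK) {
                /* back off: drop what we hold, then sleep for the contended lock */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, &ctx);
                err = ww_mutex_lock(a, &ctx);   /* a real caller would loop on -EDEADLK */
                if (err)
                        ww_mutex_unlock(b);
        }
        if (err) {
                ww_acquire_fini(&ctx);
                return err;
        }

        ww_acquire_done(&ctx);
        /* ... both objects are locked ... */
        ww_mutex_unlock(a);
        ww_mutex_unlock(b);
        ww_acquire_fini(&ctx);
        return 0;
}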
14900 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex.h
14901 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex.h        2017-04-16 10:38:29.000000000 +0200
14902 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex.h     2017-04-18 17:54:26.000000000 +0200
14903 @@ -11,8 +11,6 @@
14904   */
14906  #define rt_mutex_deadlock_check(l)                     (0)
14907 -#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
14908 -#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
14909  #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
14910  #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
14911  #define debug_rt_mutex_lock(l)                         do { } while (0)
14912 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex_common.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex_common.h
14913 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rtmutex_common.h 2017-04-16 10:38:29.000000000 +0200
14914 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rtmutex_common.h      2017-04-18 17:54:26.000000000 +0200
14915 @@ -27,6 +27,7 @@
14916         struct rb_node          pi_tree_entry;
14917         struct task_struct      *task;
14918         struct rt_mutex         *lock;
14919 +       bool                    savestate;
14920  #ifdef CONFIG_DEBUG_RT_MUTEXES
14921         unsigned long           ip;
14922         struct pid              *deadlock_task_pid;
14923 @@ -98,22 +99,45 @@
14924  /*
14925   * PI-futex support (proxy locking functions, etc.):
14926   */
14927 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
14928 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
14930  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
14931  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
14932                                        struct task_struct *proxy_owner);
14933  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
14934                                   struct task_struct *proxy_owner);
14935 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
14936 +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14937 +                                    struct rt_mutex_waiter *waiter,
14938 +                                    struct task_struct *task);
14939  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14940                                      struct rt_mutex_waiter *waiter,
14941                                      struct task_struct *task);
14942 -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
14943 -                                     struct hrtimer_sleeper *to,
14944 -                                     struct rt_mutex_waiter *waiter);
14945 -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
14946 -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
14947 -                                 struct wake_q_head *wqh);
14948 +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
14949 +                              struct hrtimer_sleeper *to,
14950 +                              struct rt_mutex_waiter *waiter);
14951 +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
14952 +                                struct rt_mutex_waiter *waiter);
14954 +extern int rt_mutex_futex_trylock(struct rt_mutex *l);
14956 +extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
14957 +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
14958 +                                struct wake_q_head *wqh,
14959 +                                struct wake_q_head *wq_sleeper);
14961  extern void rt_mutex_adjust_prio(struct task_struct *task);
14963 +/* RW semaphore special interface */
14964 +struct ww_acquire_ctx;
14966 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
14967 +                                    struct hrtimer_sleeper *timeout,
14968 +                                    enum rtmutex_chainwalk chwalk,
14969 +                                    struct ww_acquire_ctx *ww_ctx,
14970 +                                    struct rt_mutex_waiter *waiter);
14972  #ifdef CONFIG_DEBUG_RT_MUTEXES
14973  # include "rtmutex-debug.h"
14974  #else
14975 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rwsem-rt.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rwsem-rt.c
14976 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/rwsem-rt.c       1970-01-01 01:00:00.000000000 +0100
14977 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/rwsem-rt.c    2017-04-18 17:54:26.000000000 +0200
14978 @@ -0,0 +1,268 @@
14980 + */
14981 +#include <linux/rwsem.h>
14982 +#include <linux/sched.h>
14983 +#include <linux/export.h>
14985 +#include "rtmutex_common.h"
14988 + * RT-specific reader/writer semaphores
14989 + *
14990 + * down_write()
14991 + *  1) Lock sem->rtmutex
14992 + *  2) Remove the reader BIAS to force readers into the slow path
14993 + *  3) Wait until all readers have left the critical region
14994 + *  4) Mark it write locked
14995 + *
14996 + * up_write()
14997 + *  1) Remove the write locked marker
14998 + *  2) Set the reader BIAS so readers can use the fast path again
14999 + *  3) Unlock sem->rtmutex to release blocked readers
15000 + *
15001 + * down_read()
15002 + *  1) Try fast path acquisition (reader BIAS is set)
15003 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
15004 + *  3) If !writelocked, acquire it for read
15005 + *  4) If writelocked, block on sem->rtmutex
15006 + *  5) unlock sem->rtmutex, goto 1)
15007 + *
15008 + * up_read()
15009 + *  1) Try fast path release (reader count != 1)
15010 + *  2) Wake the writer waiting in down_write()#3
15011 + *
15012 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
15013 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
15014 + * are subject to the rtmutex priority/DL inheritance mechanism.
15015 + *
15016 + * It's possible to make the rw semaphores writer fair by keeping a list of
15017 + * active readers. A blocked writer would force all newly incoming readers to
15018 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
15019 + * reader after the other. We can't use multi-reader inheritance because there
15020 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
15021 + * reader boosting/handover mechanism is a major surgery for a very dubious
15022 + * value.
15023 + *
15024 + * The risk of writer starvation is there, but the pathological use cases
15025 + * which trigger it are not necessarily the typical RT workloads.
15026 + */
15028 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
15029 +                 struct lock_class_key *key)
15031 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15032 +       /*
15033 +        * Make sure we are not reinitializing a held semaphore:
15034 +        */
15035 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
15036 +       lockdep_init_map(&sem->dep_map, name, key, 0);
15037 +#endif
15038 +       atomic_set(&sem->readers, READER_BIAS);
15040 +EXPORT_SYMBOL(__rwsem_init);
15042 +int __down_read_trylock(struct rw_semaphore *sem)
15044 +       int r, old;
15046 +       /*
15047 +        * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
15048 +        * set.
15049 +        */
15050 +       for (r = atomic_read(&sem->readers); r < 0;) {
15051 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
15052 +               if (likely(old == r))
15053 +                       return 1;
15054 +               r = old;
15055 +       }
15056 +       return 0;
15059 +void __sched __down_read(struct rw_semaphore *sem)
15061 +       struct rt_mutex *m = &sem->rtmutex;
15062 +       struct rt_mutex_waiter waiter;
15064 +       if (__down_read_trylock(sem))
15065 +               return;
15067 +       might_sleep();
15068 +       raw_spin_lock_irq(&m->wait_lock);
15069 +       /*
15070 +        * Allow readers as long as the writer has not completely
15071 +        * acquired the semaphore for write.
15072 +        */
15073 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
15074 +               atomic_inc(&sem->readers);
15075 +               raw_spin_unlock_irq(&m->wait_lock);
15076 +               return;
15077 +       }
15079 +       /*
15080 +        * Call into the slow lock path with the rtmutex->wait_lock
15081 +        * held, so this can't result in the following race:
15082 +        *
15083 +        * Reader1              Reader2         Writer
15084 +        *                      down_read()
15085 +        *                                      down_write()
15086 +        *                                      rtmutex_lock(m)
15087 +        *                                      swait()
15088 +        * down_read()
15089 +        * unlock(m->wait_lock)
15090 +        *                      up_read()
15091 +        *                      swake()
15092 +        *                                      lock(m->wait_lock)
15093 +        *                                      sem->writelocked=true
15094 +        *                                      unlock(m->wait_lock)
15095 +        *
15096 +        *                                      up_write()
15097 +        *                                      sem->writelocked=false
15098 +        *                                      rtmutex_unlock(m)
15099 +        *                      down_read()
15100 +        *                                      down_write()
15101 +        *                                      rtmutex_lock(m)
15102 +        *                                      swait()
15103 +        * rtmutex_lock(m)
15104 +        *
15105 +        * That would put Reader1 behind the writer waiting on
15106 +        * Reader2 to call up_read() which might be unbound.
15107 +        */
15108 +       rt_mutex_init_waiter(&waiter, false);
15109 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
15110 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
15111 +                                &waiter);
15112 +       /*
15113 +        * The slowlock() above is guaranteed to return with the rtmutex
15114 +        * now held, so there can't be a writer active. Increment the reader
15115 +        * count and immediately drop the rtmutex again.
15116 +        */
15117 +       atomic_inc(&sem->readers);
15118 +       raw_spin_unlock_irq(&m->wait_lock);
15119 +       rt_mutex_unlock(m);
15121 +       debug_rt_mutex_free_waiter(&waiter);
15124 +void __up_read(struct rw_semaphore *sem)
15126 +       struct rt_mutex *m = &sem->rtmutex;
15127 +       struct task_struct *tsk;
15129 +       /*
15130 +        * sem->readers can only hit 0 when a writer is waiting for the
15131 +        * active readers to leave the critical region.
15132 +        */
15133 +       if (!atomic_dec_and_test(&sem->readers))
15134 +               return;
15136 +       might_sleep();
15137 +       raw_spin_lock_irq(&m->wait_lock);
15138 +       /*
15139 +        * Wake the writer, i.e. the rtmutex owner. It might release the
15140 +        * rtmutex concurrently in the fast path (due to a signal), but to
15141 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
15142 +        * case which can happen is a spurious wakeup.
15143 +        */
15144 +       tsk = rt_mutex_owner(m);
15145 +       if (tsk)
15146 +               wake_up_process(tsk);
15148 +       raw_spin_unlock_irq(&m->wait_lock);
15151 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
15152 +                             unsigned long flags)
15154 +       struct rt_mutex *m = &sem->rtmutex;
15156 +       atomic_add(READER_BIAS - bias, &sem->readers);
15157 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15158 +       rt_mutex_unlock(m);
15161 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
15163 +       struct rt_mutex *m = &sem->rtmutex;
15164 +       unsigned long flags;
15166 +       /* Take the rtmutex as a first step */
15167 +       if (rt_mutex_lock_state(m, state))
15168 +               return -EINTR;
15170 +       /* Force readers into slow path */
15171 +       atomic_sub(READER_BIAS, &sem->readers);
15172 +       might_sleep();
15174 +       set_current_state(state);
15175 +       for (;;) {
15176 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
15177 +               /* Have all readers left the critical region? */
15178 +               if (!atomic_read(&sem->readers)) {
15179 +                       atomic_set(&sem->readers, WRITER_BIAS);
15180 +                       __set_current_state(TASK_RUNNING);
15181 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15182 +                       return 0;
15183 +               }
15185 +               if (signal_pending_state(state, current)) {
15186 +                       __set_current_state(TASK_RUNNING);
15187 +                       __up_write_unlock(sem, 0, flags);
15188 +                       return -EINTR;
15189 +               }
15190 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15192 +               if (atomic_read(&sem->readers) != 0) {
15193 +                       schedule();
15194 +                       set_current_state(state);
15195 +               }
15196 +       }
15199 +void __sched __down_write(struct rw_semaphore *sem)
15201 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
15204 +int __sched __down_write_killable(struct rw_semaphore *sem)
15206 +       return __down_write_common(sem, TASK_KILLABLE);
15209 +int __down_write_trylock(struct rw_semaphore *sem)
15211 +       struct rt_mutex *m = &sem->rtmutex;
15212 +       unsigned long flags;
15214 +       if (!rt_mutex_trylock(m))
15215 +               return 0;
15217 +       atomic_sub(READER_BIAS, &sem->readers);
15219 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15220 +       if (!atomic_read(&sem->readers)) {
15221 +               atomic_set(&sem->readers, WRITER_BIAS);
15222 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
15223 +               return 1;
15224 +       }
15225 +       __up_write_unlock(sem, 0, flags);
15226 +       return 0;
15229 +void __up_write(struct rw_semaphore *sem)
15231 +       struct rt_mutex *m = &sem->rtmutex;
15232 +       unsigned long flags;
15234 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15235 +       __up_write_unlock(sem, WRITER_BIAS, flags);
15238 +void __downgrade_write(struct rw_semaphore *sem)
15240 +       struct rt_mutex *m = &sem->rtmutex;
15241 +       unsigned long flags;
15243 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
15244 +       /* Release it and account current as reader */
15245 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
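The reader fast path in __down_read_trylock() above works because READER_BIAS keeps sem->readers negative until a writer subtracts the bias. A minimal user-space sketch of that cmpxchg loop using C11 atomics (the bias value and all demo_* names are illustrative, not taken from the kernel headers):

#include <stdatomic.h>
#include <stdio.h>

#define READER_BIAS  (-(1 << 16))   /* illustrative bias value */

struct demo_rwsem { atomic_int readers; };

/* Mirrors the cmpxchg loop: succeed only while readers < 0 (bias present). */
static int demo_down_read_trylock(struct demo_rwsem *sem)
{
        int r = atomic_load(&sem->readers);

        while (r < 0) {
                if (atomic_compare_exchange_weak(&sem->readers, &r, r + 1))
                        return 1;       /* got the read lock */
                /* r was reloaded by the failed CAS; retry */
        }
        return 0;       /* a writer removed the bias: take the slow path */
}

int main(void)
{
        struct demo_rwsem sem = { READER_BIAS };

        printf("first try: %d\n", demo_down_read_trylock(&sem));    /* 1 */
        atomic_fetch_sub(&sem.readers, READER_BIAS);                 /* writer removes bias */
        printf("after writer: %d\n", demo_down_read_trylock(&sem)); /* 0 */
        return 0;
}

As in the kernel code, a failed compare-exchange reloads the current count, and any non-negative value sends the reader into the slow path while the remaining reader count stays visible to the waiting writer.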
15247 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/spinlock.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/spinlock.c
15248 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/spinlock.c       2017-04-16 10:38:29.000000000 +0200
15249 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/spinlock.c    2017-04-18 17:54:26.000000000 +0200
15250 @@ -124,8 +124,11 @@
15251   *         __[spin|read|write]_lock_bh()
15252   */
15253  BUILD_LOCK_OPS(spin, raw_spinlock);
15255 +#ifndef CONFIG_PREEMPT_RT_FULL
15256  BUILD_LOCK_OPS(read, rwlock);
15257  BUILD_LOCK_OPS(write, rwlock);
15258 +#endif
15260  #endif
15262 @@ -209,6 +212,8 @@
15263  EXPORT_SYMBOL(_raw_spin_unlock_bh);
15264  #endif
15266 +#ifndef CONFIG_PREEMPT_RT_FULL
15268  #ifndef CONFIG_INLINE_READ_TRYLOCK
15269  int __lockfunc _raw_read_trylock(rwlock_t *lock)
15271 @@ -353,6 +358,8 @@
15272  EXPORT_SYMBOL(_raw_write_unlock_bh);
15273  #endif
15275 +#endif /* !PREEMPT_RT_FULL */
15277  #ifdef CONFIG_DEBUG_LOCK_ALLOC
15279  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
15280 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/spinlock_debug.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/spinlock_debug.c
15281 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/locking/spinlock_debug.c 2017-04-16 10:38:29.000000000 +0200
15282 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/locking/spinlock_debug.c      2017-04-18 17:54:26.000000000 +0200
15283 @@ -31,6 +31,7 @@
15285  EXPORT_SYMBOL(__raw_spin_lock_init);
15287 +#ifndef CONFIG_PREEMPT_RT_FULL
15288  void __rwlock_init(rwlock_t *lock, const char *name,
15289                    struct lock_class_key *key)
15291 @@ -48,6 +49,7 @@
15294  EXPORT_SYMBOL(__rwlock_init);
15295 +#endif
15297  static void spin_dump(raw_spinlock_t *lock, const char *msg)
15299 @@ -159,6 +161,7 @@
15300         arch_spin_unlock(&lock->raw_lock);
15303 +#ifndef CONFIG_PREEMPT_RT_FULL
15304  static void rwlock_bug(rwlock_t *lock, const char *msg)
15306         if (!debug_locks_off())
15307 @@ -300,3 +303,5 @@
15308         debug_write_unlock(lock);
15309         arch_write_unlock(&lock->raw_lock);
15312 +#endif
15313 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/module.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/module.c
15314 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/module.c 2017-04-16 10:38:29.000000000 +0200
15315 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/module.c      2017-04-18 17:54:26.000000000 +0200
15316 @@ -660,16 +660,7 @@
15317                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
15320 -/**
15321 - * is_module_percpu_address - test whether address is from module static percpu
15322 - * @addr: address to test
15323 - *
15324 - * Test whether @addr belongs to module static percpu area.
15325 - *
15326 - * RETURNS:
15327 - * %true if @addr is from module static percpu area
15328 - */
15329 -bool is_module_percpu_address(unsigned long addr)
15330 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
15332         struct module *mod;
15333         unsigned int cpu;
15334 @@ -683,9 +674,15 @@
15335                         continue;
15336                 for_each_possible_cpu(cpu) {
15337                         void *start = per_cpu_ptr(mod->percpu, cpu);
15338 +                       void *va = (void *)addr;
15340 -                       if ((void *)addr >= start &&
15341 -                           (void *)addr < start + mod->percpu_size) {
15342 +                       if (va >= start && va < start + mod->percpu_size) {
15343 +                               if (can_addr) {
15344 +                                       *can_addr = (unsigned long) (va - start);
15345 +                                       *can_addr += (unsigned long)
15346 +                                               per_cpu_ptr(mod->percpu,
15347 +                                                           get_boot_cpu_id());
15348 +                               }
15349                                 preempt_enable();
15350                                 return true;
15351                         }
15352 @@ -696,6 +693,20 @@
15353         return false;
15356 +/**
15357 + * is_module_percpu_address - test whether address is from module static percpu
15358 + * @addr: address to test
15359 + *
15360 + * Test whether @addr belongs to module static percpu area.
15361 + *
15362 + * RETURNS:
15363 + * %true if @addr is from module static percpu area
15364 + */
15365 +bool is_module_percpu_address(unsigned long addr)
15367 +       return __is_module_percpu_address(addr, NULL);
15370  #else /* ... !CONFIG_SMP */
15372  static inline void __percpu *mod_percpu(struct module *mod)
15373 @@ -727,6 +738,11 @@
15374         return false;
15377 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
15379 +       return false;
15382  #endif /* CONFIG_SMP */
15384  #define MODINFO_ATTR(field)    \
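The module.c hunk above splits is_module_percpu_address() so callers can also obtain a canonical address: the offset inside the matching CPU's copy, rebased onto the boot CPU's copy via get_boot_cpu_id(). A small user-space model of that lookup and rebasing, with made-up array names and sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS        4
#define PCPU_AREA_SIZE 64

/* One per-CPU copy of a module's static per-cpu area (illustrative layout). */
static char pcpu_area[NR_CPUS][PCPU_AREA_SIZE];

/*
 * Model of __is_module_percpu_address(): report whether addr lies in any
 * CPU's copy and, if asked, rewrite it to the boot CPU's (CPU 0) copy so
 * that callers such as lockdep can compare addresses canonically.
 */
static bool demo_is_percpu_address(uintptr_t addr, uintptr_t *can_addr)
{
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                uintptr_t start = (uintptr_t)pcpu_area[cpu];

                if (addr >= start && addr < start + PCPU_AREA_SIZE) {
                        if (can_addr)
                                *can_addr = (addr - start) +
                                            (uintptr_t)pcpu_area[0];
                        return true;
                }
        }
        return false;
}

int main(void)
{
        uintptr_t canon = 0;
        uintptr_t addr = (uintptr_t)&pcpu_area[2][12];  /* CPU 2, offset 12 */

        if (demo_is_percpu_address(addr, &canon))
                printf("canonical ok: %d\n",
                       canon == (uintptr_t)&pcpu_area[0][12]);
        return 0;
}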
15385 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/panic.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/panic.c
15386 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/panic.c  2017-04-16 10:38:29.000000000 +0200
15387 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/panic.c       2017-04-18 17:54:26.000000000 +0200
15388 @@ -482,9 +482,11 @@
15390  static int init_oops_id(void)
15392 +#ifndef CONFIG_PREEMPT_RT_FULL
15393         if (!oops_id)
15394                 get_random_bytes(&oops_id, sizeof(oops_id));
15395         else
15396 +#endif
15397                 oops_id++;
15399         return 0;
15400 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/power/hibernate.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/power/hibernate.c
15401 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/power/hibernate.c        2017-04-16 10:38:29.000000000 +0200
15402 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/power/hibernate.c     2017-04-18 17:54:26.000000000 +0200
15403 @@ -286,6 +286,8 @@
15405         local_irq_disable();
15407 +       system_state = SYSTEM_SUSPEND;
15409         error = syscore_suspend();
15410         if (error) {
15411                 printk(KERN_ERR "PM: Some system devices failed to power down, "
15412 @@ -317,6 +319,7 @@
15413         syscore_resume();
15415   Enable_irqs:
15416 +       system_state = SYSTEM_RUNNING;
15417         local_irq_enable();
15419   Enable_cpus:
15420 @@ -446,6 +449,7 @@
15421                 goto Enable_cpus;
15423         local_irq_disable();
15424 +       system_state = SYSTEM_SUSPEND;
15426         error = syscore_suspend();
15427         if (error)
15428 @@ -479,6 +483,7 @@
15429         syscore_resume();
15431   Enable_irqs:
15432 +       system_state = SYSTEM_RUNNING;
15433         local_irq_enable();
15435   Enable_cpus:
15436 @@ -564,6 +569,7 @@
15437                 goto Enable_cpus;
15439         local_irq_disable();
15440 +       system_state = SYSTEM_SUSPEND;
15441         syscore_suspend();
15442         if (pm_wakeup_pending()) {
15443                 error = -EAGAIN;
15444 @@ -576,6 +582,7 @@
15446   Power_up:
15447         syscore_resume();
15448 +       system_state = SYSTEM_RUNNING;
15449         local_irq_enable();
15451   Enable_cpus:
15452 @@ -676,6 +683,10 @@
15453         return error;
15456 +#ifndef CONFIG_SUSPEND
15457 +bool pm_in_action;
15458 +#endif
15460  /**
15461   * hibernate - Carry out system hibernation, including saving the image.
15462   */
15463 @@ -689,6 +700,8 @@
15464                 return -EPERM;
15465         }
15467 +       pm_in_action = true;
15469         lock_system_sleep();
15470         /* The snapshot device should not be opened while we're running */
15471         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
15472 @@ -766,6 +779,7 @@
15473         atomic_inc(&snapshot_device_available);
15474   Unlock:
15475         unlock_system_sleep();
15476 +       pm_in_action = false;
15477         return error;
15480 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/power/suspend.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/power/suspend.c
15481 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/power/suspend.c  2017-04-16 10:38:29.000000000 +0200
15482 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/power/suspend.c       2017-04-18 17:54:26.000000000 +0200
15483 @@ -369,6 +369,8 @@
15484         arch_suspend_disable_irqs();
15485         BUG_ON(!irqs_disabled());
15487 +       system_state = SYSTEM_SUSPEND;
15489         error = syscore_suspend();
15490         if (!error) {
15491                 *wakeup = pm_wakeup_pending();
15492 @@ -385,6 +387,8 @@
15493                 syscore_resume();
15494         }
15496 +       system_state = SYSTEM_RUNNING;
15498         arch_suspend_enable_irqs();
15499         BUG_ON(irqs_disabled());
15501 @@ -527,6 +531,8 @@
15502         return error;
15505 +bool pm_in_action;
15507  /**
15508   * pm_suspend - Externally visible function for suspending the system.
15509   * @state: System sleep state to enter.
15510 @@ -541,6 +547,8 @@
15511         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
15512                 return -EINVAL;
15514 +       pm_in_action = true;
15516         error = enter_state(state);
15517         if (error) {
15518                 suspend_stats.fail++;
15519 @@ -548,6 +556,7 @@
15520         } else {
15521                 suspend_stats.success++;
15522         }
15523 +       pm_in_action = false;
15524         return error;
15526  EXPORT_SYMBOL(pm_suspend);
15527 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/printk/printk.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/printk/printk.c
15528 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/printk/printk.c  2017-04-16 10:38:29.000000000 +0200
15529 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/printk/printk.c       2017-04-18 17:54:26.000000000 +0200
15530 @@ -351,6 +351,65 @@
15531   */
15532  DEFINE_RAW_SPINLOCK(logbuf_lock);
15534 +#ifdef CONFIG_EARLY_PRINTK
15535 +struct console *early_console;
15537 +static void early_vprintk(const char *fmt, va_list ap)
15539 +       if (early_console) {
15540 +               char buf[512];
15541 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
15543 +               early_console->write(early_console, buf, n);
15544 +       }
15547 +asmlinkage void early_printk(const char *fmt, ...)
15549 +       va_list ap;
15551 +       va_start(ap, fmt);
15552 +       early_vprintk(fmt, ap);
15553 +       va_end(ap);
15557 + * This is independent of any log levels - a global
15558 + * kill switch that turns off all of printk.
15559 + *
15560 + * Used by the NMI watchdog if early-printk is enabled.
15561 + */
15562 +static bool __read_mostly printk_killswitch;
15564 +static int __init force_early_printk_setup(char *str)
15566 +       printk_killswitch = true;
15567 +       return 0;
15569 +early_param("force_early_printk", force_early_printk_setup);
15571 +void printk_kill(void)
15573 +       printk_killswitch = true;
15576 +#ifdef CONFIG_PRINTK
15577 +static int forced_early_printk(const char *fmt, va_list ap)
15579 +       if (!printk_killswitch)
15580 +               return 0;
15581 +       early_vprintk(fmt, ap);
15582 +       return 1;
15584 +#endif
15586 +#else
15587 +static inline int forced_early_printk(const char *fmt, va_list ap)
15589 +       return 0;
15591 +#endif
15593  #ifdef CONFIG_PRINTK
15594  DECLARE_WAIT_QUEUE_HEAD(log_wait);
15595  /* the next printk record to read by syslog(READ) or /proc/kmsg */
15596 @@ -1337,6 +1396,7 @@
15598         char *text;
15599         int len = 0;
15600 +       int attempts = 0;
15602         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
15603         if (!text)
15604 @@ -1348,6 +1408,14 @@
15605                 u64 seq;
15606                 u32 idx;
15607                 enum log_flags prev;
15608 +               int num_msg;
15609 +try_again:
15610 +               attempts++;
15611 +               if (attempts > 10) {
15612 +                       len = -EBUSY;
15613 +                       goto out;
15614 +               }
15615 +               num_msg = 0;
15617                 /*
15618                  * Find first record that fits, including all following records,
15619 @@ -1363,6 +1431,14 @@
15620                         prev = msg->flags;
15621                         idx = log_next(idx);
15622                         seq++;
15623 +                       num_msg++;
15624 +                       if (num_msg > 5) {
15625 +                               num_msg = 0;
15626 +                               raw_spin_unlock_irq(&logbuf_lock);
15627 +                               raw_spin_lock_irq(&logbuf_lock);
15628 +                               if (clear_seq < log_first_seq)
15629 +                                       goto try_again;
15630 +                       }
15631                 }
15633                 /* move first record forward until length fits into the buffer */
15634 @@ -1376,6 +1452,14 @@
15635                         prev = msg->flags;
15636                         idx = log_next(idx);
15637                         seq++;
15638 +                       num_msg++;
15639 +                       if (num_msg > 5) {
15640 +                               num_msg = 0;
15641 +                               raw_spin_unlock_irq(&logbuf_lock);
15642 +                               raw_spin_lock_irq(&logbuf_lock);
15643 +                               if (clear_seq < log_first_seq)
15644 +                                       goto try_again;
15645 +                       }
15646                 }
15648                 /* last message fitting into this dump */
15649 @@ -1416,6 +1500,7 @@
15650                 clear_seq = log_next_seq;
15651                 clear_idx = log_next_idx;
15652         }
15653 +out:
15654         raw_spin_unlock_irq(&logbuf_lock);
15656         kfree(text);
15657 @@ -1569,6 +1654,12 @@
15658         if (!console_drivers)
15659                 return;
15661 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
15662 +               if (in_irq() || in_nmi())
15663 +                       return;
15664 +       }
15666 +       migrate_disable();
15667         for_each_console(con) {
15668                 if (exclusive_console && con != exclusive_console)
15669                         continue;
15670 @@ -1584,6 +1675,7 @@
15671                 else
15672                         con->write(con, text, len);
15673         }
15674 +       migrate_enable();
15677  /*
15678 @@ -1781,6 +1873,13 @@
15679         /* cpu currently holding logbuf_lock in this function */
15680         static unsigned int logbuf_cpu = UINT_MAX;
15682 +       /*
15683 +        * Fall back to early_printk if a debugging subsystem has
15684 +        * killed printk output
15685 +        */
15686 +       if (unlikely(forced_early_printk(fmt, args)))
15687 +               return 1;
15689         if (level == LOGLEVEL_SCHED) {
15690                 level = LOGLEVEL_DEFAULT;
15691                 in_sched = true;
15692 @@ -1885,13 +1984,23 @@
15694         /* If called from the scheduler, we can not call up(). */
15695         if (!in_sched) {
15696 +               int may_trylock = 1;
15698                 lockdep_off();
15699 +#ifdef CONFIG_PREEMPT_RT_FULL
15700 +               /*
15701 +                * we can't take a sleeping lock with IRQs or preemption disabled
15702 +                * so we can't print in these contexts
15703 +                */
15704 +               if (!(preempt_count() == 0 && !irqs_disabled()))
15705 +                       may_trylock = 0;
15706 +#endif
15707                 /*
15708                  * Try to acquire and then immediately release the console
15709                  * semaphore.  The release will print out buffers and wake up
15710                  * /dev/kmsg and syslog() users.
15711                  */
15712 -               if (console_trylock())
15713 +               if (may_trylock && console_trylock())
15714                         console_unlock();
15715                 lockdep_on();
15716         }
15717 @@ -2014,26 +2123,6 @@
15719  #endif /* CONFIG_PRINTK */
15721 -#ifdef CONFIG_EARLY_PRINTK
15722 -struct console *early_console;
15724 -asmlinkage __visible void early_printk(const char *fmt, ...)
15726 -       va_list ap;
15727 -       char buf[512];
15728 -       int n;
15730 -       if (!early_console)
15731 -               return;
15733 -       va_start(ap, fmt);
15734 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
15735 -       va_end(ap);
15737 -       early_console->write(early_console, buf, n);
15739 -#endif
15741  static int __add_preferred_console(char *name, int idx, char *options,
15742                                    char *brl_options)
15744 @@ -2303,11 +2392,16 @@
15745                 goto out;
15747         len = cont_print_text(text, size);
15748 +#ifdef CONFIG_PREEMPT_RT_FULL
15749 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15750 +       call_console_drivers(cont.level, NULL, 0, text, len);
15751 +#else
15752         raw_spin_unlock(&logbuf_lock);
15753         stop_critical_timings();
15754         call_console_drivers(cont.level, NULL, 0, text, len);
15755         start_critical_timings();
15756         local_irq_restore(flags);
15757 +#endif
15758         return;
15759  out:
15760         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15761 @@ -2431,13 +2525,17 @@
15762                 console_idx = log_next(console_idx);
15763                 console_seq++;
15764                 console_prev = msg->flags;
15765 +#ifdef CONFIG_PREEMPT_RT_FULL
15766 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15767 +               call_console_drivers(level, ext_text, ext_len, text, len);
15768 +#else
15769                 raw_spin_unlock(&logbuf_lock);
15771                 stop_critical_timings();        /* don't trace print latency */
15772                 call_console_drivers(level, ext_text, ext_len, text, len);
15773                 start_critical_timings();
15774                 local_irq_restore(flags);
15776 +#endif
15777                 if (do_cond_resched)
15778                         cond_resched();
15779         }
15780 @@ -2489,6 +2587,11 @@
15782         struct console *c;
15784 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
15785 +               if (in_irq() || in_nmi())
15786 +                       return;
15787 +       }
15789         /*
15790          * console_unblank can no longer be called in interrupt context unless
15791          * oops_in_progress is set to 1..
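The printk.c changes above move early_printk() ahead of the normal path and add a global kill switch: once printk_kill() or the force_early_printk= parameter sets it, vprintk_emit() hands everything to the early console instead of the lock-protected log buffer. A user-space sketch of that gating (the demo_* names are stand-ins, not the kernel API):

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static bool demo_killswitch;    /* set once a debug facility takes over */

/* Stand-in for early_vprintk(): write straight to the "early console". */
static void demo_early_vprintk(const char *fmt, va_list ap)
{
        vfprintf(stderr, fmt, ap);
}

/* Mirrors vprintk_emit()'s early bail-out when the killswitch is set. */
static int demo_printk(const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        if (demo_killswitch) {
                demo_early_vprintk(fmt, ap);
                va_end(ap);
                return 1;               /* handled by the early path */
        }
        vprintf(fmt, ap);               /* normal buffered path */
        va_end(ap);
        return 0;
}

int main(void)
{
        demo_printk("normal path\n");
        demo_killswitch = true;         /* what printk_kill() would do */
        demo_printk("early console path\n");
        return 0;
}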
15792 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/ptrace.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/ptrace.c
15793 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/ptrace.c 2017-04-16 10:38:29.000000000 +0200
15794 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/ptrace.c      2017-04-18 17:54:26.000000000 +0200
15795 @@ -166,7 +166,14 @@
15797         spin_lock_irq(&task->sighand->siglock);
15798         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
15799 -               task->state = __TASK_TRACED;
15800 +               unsigned long flags;
15802 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
15803 +               if (task->state & __TASK_TRACED)
15804 +                       task->state = __TASK_TRACED;
15805 +               else
15806 +                       task->saved_state = __TASK_TRACED;
15807 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15808                 ret = true;
15809         }
15810         spin_unlock_irq(&task->sighand->siglock);
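On RT a task blocked on a sleeping spinlock parks its real state in task->saved_state, so the ptrace_freeze_traced() hunk above must set __TASK_TRACED in whichever field currently carries the traced state. A compact user-space model of that decision (the state bit values here are illustrative, not the kernel's):

#include <stdio.h>

#define TASK_UNINTERRUPTIBLE 0x0002
#define __TASK_TRACED        0x0008
#define TASK_WAKEKILL        0x0100

struct demo_task {
        unsigned int state;       /* what the scheduler sees right now    */
        unsigned int saved_state; /* real state while in rtmutex slowpath */
};

/*
 * Mirrors the hunk: strip the wakeup bits from the field that actually
 * holds the traced state, which may be saved_state if the task is
 * currently blocked on a sleeping spinlock.
 */
static void demo_freeze_traced(struct demo_task *t)
{
        if (t->state & __TASK_TRACED)
                t->state = __TASK_TRACED;
        else
                t->saved_state = __TASK_TRACED;
}

int main(void)
{
        struct demo_task stopped = { TASK_WAKEKILL | __TASK_TRACED, 0 };
        struct demo_task blocked = { TASK_UNINTERRUPTIBLE,
                                     TASK_WAKEKILL | __TASK_TRACED };

        demo_freeze_traced(&stopped);
        demo_freeze_traced(&blocked);
        printf("stopped: state=%#x\n", stopped.state);
        printf("blocked: state=%#x saved_state=%#x\n",
               blocked.state, blocked.saved_state);
        return 0;
}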
15811 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/rcutorture.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/rcutorture.c
15812 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/rcutorture.c 2017-04-16 10:38:29.000000000 +0200
15813 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/rcutorture.c      2017-04-18 17:54:26.000000000 +0200
15814 @@ -404,6 +404,7 @@
15815         .name           = "rcu"
15816  };
15818 +#ifndef CONFIG_PREEMPT_RT_FULL
15819  /*
15820   * Definitions for rcu_bh torture testing.
15821   */
15822 @@ -443,6 +444,12 @@
15823         .name           = "rcu_bh"
15824  };
15826 +#else
15827 +static struct rcu_torture_ops rcu_bh_ops = {
15828 +       .ttype          = INVALID_RCU_FLAVOR,
15830 +#endif
15832  /*
15833   * Don't even think about trying any of these in real life!!!
15834   * The names include "busted", and they really mean it!
15835 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree.c
15836 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree.c       2017-04-16 10:38:29.000000000 +0200
15837 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree.c    2017-04-18 17:54:26.000000000 +0200
15838 @@ -55,6 +55,11 @@
15839  #include <linux/random.h>
15840  #include <linux/trace_events.h>
15841  #include <linux/suspend.h>
15842 +#include <linux/delay.h>
15843 +#include <linux/gfp.h>
15844 +#include <linux/oom.h>
15845 +#include <linux/smpboot.h>
15846 +#include "../time/tick-internal.h"
15848  #include "tree.h"
15849  #include "rcu.h"
15850 @@ -260,6 +265,19 @@
15851                            this_cpu_ptr(&rcu_sched_data), true);
15854 +#ifdef CONFIG_PREEMPT_RT_FULL
15855 +static void rcu_preempt_qs(void);
15857 +void rcu_bh_qs(void)
15859 +       unsigned long flags;
15861 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
15862 +       local_irq_save(flags);
15863 +       rcu_preempt_qs();
15864 +       local_irq_restore(flags);
15866 +#else
15867  void rcu_bh_qs(void)
15869         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
15870 @@ -269,6 +287,7 @@
15871                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
15872         }
15874 +#endif
15876  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
15878 @@ -449,11 +468,13 @@
15879  /*
15880   * Return the number of RCU BH batches started thus far for debug & stats.
15881   */
15882 +#ifndef CONFIG_PREEMPT_RT_FULL
15883  unsigned long rcu_batches_started_bh(void)
15885         return rcu_bh_state.gpnum;
15887  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
15888 +#endif
15890  /*
15891   * Return the number of RCU batches completed thus far for debug & stats.
15892 @@ -473,6 +494,7 @@
15894  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
15896 +#ifndef CONFIG_PREEMPT_RT_FULL
15897  /*
15898   * Return the number of RCU BH batches completed thus far for debug & stats.
15899   */
15900 @@ -481,6 +503,7 @@
15901         return rcu_bh_state.completed;
15903  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
15904 +#endif
15906  /*
15907   * Return the number of RCU expedited batches completed thus far for
15908 @@ -504,6 +527,7 @@
15910  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
15912 +#ifndef CONFIG_PREEMPT_RT_FULL
15913  /*
15914   * Force a quiescent state.
15915   */
15916 @@ -522,6 +546,13 @@
15918  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
15920 +#else
15921 +void rcu_force_quiescent_state(void)
15924 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
15925 +#endif
15927  /*
15928   * Force a quiescent state for RCU-sched.
15929   */
15930 @@ -572,9 +603,11 @@
15931         case RCU_FLAVOR:
15932                 rsp = rcu_state_p;
15933                 break;
15934 +#ifndef CONFIG_PREEMPT_RT_FULL
15935         case RCU_BH_FLAVOR:
15936                 rsp = &rcu_bh_state;
15937                 break;
15938 +#endif
15939         case RCU_SCHED_FLAVOR:
15940                 rsp = &rcu_sched_state;
15941                 break;
15942 @@ -3016,18 +3049,17 @@
15943  /*
15944   * Do RCU core processing for the current CPU.
15945   */
15946 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
15947 +static __latent_entropy void rcu_process_callbacks(void)
15949         struct rcu_state *rsp;
15951         if (cpu_is_offline(smp_processor_id()))
15952                 return;
15953 -       trace_rcu_utilization(TPS("Start RCU core"));
15954         for_each_rcu_flavor(rsp)
15955                 __rcu_process_callbacks(rsp);
15956 -       trace_rcu_utilization(TPS("End RCU core"));
15959 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
15960  /*
15961   * Schedule RCU callback invocation.  If the specified type of RCU
15962   * does not support RCU priority boosting, just do a direct call,
15963 @@ -3039,18 +3071,105 @@
15965         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
15966                 return;
15967 -       if (likely(!rsp->boost)) {
15968 -               rcu_do_batch(rsp, rdp);
15969 +       rcu_do_batch(rsp, rdp);
15972 +static void rcu_wake_cond(struct task_struct *t, int status)
15974 +       /*
15975 +        * If the thread is yielding, only wake it when this
15976 +        * is invoked from idle
15977 +        */
15978 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
15979 +               wake_up_process(t);
15983 + * Wake up this CPU's rcuc kthread to do RCU core processing.
15984 + */
15985 +static void invoke_rcu_core(void)
15987 +       unsigned long flags;
15988 +       struct task_struct *t;
15990 +       if (!cpu_online(smp_processor_id()))
15991                 return;
15992 +       local_irq_save(flags);
15993 +       __this_cpu_write(rcu_cpu_has_work, 1);
15994 +       t = __this_cpu_read(rcu_cpu_kthread_task);
15995 +       if (t != NULL && current != t)
15996 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
15997 +       local_irq_restore(flags);
16000 +static void rcu_cpu_kthread_park(unsigned int cpu)
16002 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16005 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
16007 +       return __this_cpu_read(rcu_cpu_has_work);
16011 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16012 + * RCU softirq used in flavors and configurations of RCU that do not
16013 + * support RCU priority boosting.
16014 + */
16015 +static void rcu_cpu_kthread(unsigned int cpu)
16017 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16018 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16019 +       int spincnt;
16021 +       for (spincnt = 0; spincnt < 10; spincnt++) {
16022 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16023 +               local_bh_disable();
16024 +               *statusp = RCU_KTHREAD_RUNNING;
16025 +               this_cpu_inc(rcu_cpu_kthread_loops);
16026 +               local_irq_disable();
16027 +               work = *workp;
16028 +               *workp = 0;
16029 +               local_irq_enable();
16030 +               if (work)
16031 +                       rcu_process_callbacks();
16032 +               local_bh_enable();
16033 +               if (*workp == 0) {
16034 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16035 +                       *statusp = RCU_KTHREAD_WAITING;
16036 +                       return;
16037 +               }
16038         }
16039 -       invoke_rcu_callbacks_kthread();
16040 +       *statusp = RCU_KTHREAD_YIELDING;
16041 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16042 +       schedule_timeout_interruptible(2);
16043 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16044 +       *statusp = RCU_KTHREAD_WAITING;
16047 -static void invoke_rcu_core(void)
16048 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16049 +       .store                  = &rcu_cpu_kthread_task,
16050 +       .thread_should_run      = rcu_cpu_kthread_should_run,
16051 +       .thread_fn              = rcu_cpu_kthread,
16052 +       .thread_comm            = "rcuc/%u",
16053 +       .setup                  = rcu_cpu_kthread_setup,
16054 +       .park                   = rcu_cpu_kthread_park,
16058 + * Spawn per-CPU RCU core processing kthreads.
16059 + */
16060 +static int __init rcu_spawn_core_kthreads(void)
16062 -       if (cpu_online(smp_processor_id()))
16063 -               raise_softirq(RCU_SOFTIRQ);
16064 +       int cpu;
16066 +       for_each_possible_cpu(cpu)
16067 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
16068 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16069 +       return 0;
16071 +early_initcall(rcu_spawn_core_kthreads);
16073  /*
16074   * Handle any core-RCU processing required by a call_rcu() invocation.
16075 @@ -3195,6 +3314,7 @@
16077  EXPORT_SYMBOL_GPL(call_rcu_sched);
16079 +#ifndef CONFIG_PREEMPT_RT_FULL
16080  /*
16081   * Queue an RCU callback for invocation after a quicker grace period.
16082   */
16083 @@ -3203,6 +3323,7 @@
16084         __call_rcu(head, func, &rcu_bh_state, -1, 0);
16086  EXPORT_SYMBOL_GPL(call_rcu_bh);
16087 +#endif
16089  /*
16090   * Queue an RCU callback for lazy invocation after a grace period.
16091 @@ -3294,6 +3415,7 @@
16093  EXPORT_SYMBOL_GPL(synchronize_sched);
16095 +#ifndef CONFIG_PREEMPT_RT_FULL
16096  /**
16097   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
16098   *
16099 @@ -3320,6 +3442,7 @@
16100                 wait_rcu_gp(call_rcu_bh);
16102  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
16103 +#endif
16105  /**
16106   * get_state_synchronize_rcu - Snapshot current RCU state
16107 @@ -3698,6 +3821,7 @@
16108         mutex_unlock(&rsp->barrier_mutex);
16111 +#ifndef CONFIG_PREEMPT_RT_FULL
16112  /**
16113   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
16114   */
16115 @@ -3706,6 +3830,7 @@
16116         _rcu_barrier(&rcu_bh_state);
16118  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
16119 +#endif
16121  /**
16122   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
16123 @@ -4227,12 +4352,13 @@
16125         rcu_bootup_announce();
16126         rcu_init_geometry();
16127 +#ifndef CONFIG_PREEMPT_RT_FULL
16128         rcu_init_one(&rcu_bh_state);
16129 +#endif
16130         rcu_init_one(&rcu_sched_state);
16131         if (dump_tree)
16132                 rcu_dump_rcu_node_tree(&rcu_sched_state);
16133         __rcu_init_preempt();
16134 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
16136         /*
16137          * We don't need protection against CPU-hotplug here because
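The tree.c changes replace RCU_SOFTIRQ with a per-CPU rcuc kthread: invoke_rcu_core() now just sets rcu_cpu_has_work and wakes the thread, and rcu_cpu_kthread() polls that flag with bounded spinning before yielding. A single-threaded user-space sketch of that control flow (the smpboot plumbing and per-CPU accessors are simplified away; names are illustrative):

#include <stdio.h>

enum { KTHREAD_RUNNING, KTHREAD_WAITING, KTHREAD_YIELDING };

static int  demo_status;
static char demo_has_work;

static void demo_process_callbacks(void)
{
        printf("processing callbacks\n");
}

/*
 * Mirrors the shape of rcu_cpu_kthread() above: spin up to 10 times on the
 * per-CPU work flag, run the callback pass when work is pending, go back to
 * waiting when the flag stays clear, and yield if work keeps arriving.
 */
static void demo_rcu_cpu_kthread(void)
{
        int spincnt;

        for (spincnt = 0; spincnt < 10; spincnt++) {
                char work = demo_has_work;

                demo_status = KTHREAD_RUNNING;
                demo_has_work = 0;
                if (work)
                        demo_process_callbacks();
                if (demo_has_work == 0) {
                        demo_status = KTHREAD_WAITING;
                        return;         /* parked until new work arrives */
                }
        }
        demo_status = KTHREAD_YIELDING; /* let lower-priority tasks run */
}

int main(void)
{
        demo_has_work = 1;              /* what invoke_rcu_core() would set */
        demo_rcu_cpu_kthread();
        printf("status=%d (1 == waiting)\n", demo_status);
        return 0;
}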
16138 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree.h
16139 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree.h       2017-04-16 10:38:29.000000000 +0200
16140 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree.h    2017-04-18 17:54:26.000000000 +0200
16141 @@ -588,18 +588,18 @@
16142   */
16143  extern struct rcu_state rcu_sched_state;
16145 +#ifndef CONFIG_PREEMPT_RT_FULL
16146  extern struct rcu_state rcu_bh_state;
16147 +#endif
16149  #ifdef CONFIG_PREEMPT_RCU
16150  extern struct rcu_state rcu_preempt_state;
16151  #endif /* #ifdef CONFIG_PREEMPT_RCU */
16153 -#ifdef CONFIG_RCU_BOOST
16154  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16155  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
16156  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16157  DECLARE_PER_CPU(char, rcu_cpu_has_work);
16158 -#endif /* #ifdef CONFIG_RCU_BOOST */
16160  #ifndef RCU_TREE_NONCORE
16162 @@ -619,10 +619,9 @@
16163  static void __init __rcu_init_preempt(void);
16164  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
16165  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
16166 -static void invoke_rcu_callbacks_kthread(void);
16167  static bool rcu_is_callbacks_kthread(void);
16168 +static void rcu_cpu_kthread_setup(unsigned int cpu);
16169  #ifdef CONFIG_RCU_BOOST
16170 -static void rcu_preempt_do_callbacks(void);
16171  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
16172                                                  struct rcu_node *rnp);
16173  #endif /* #ifdef CONFIG_RCU_BOOST */
16174 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree_plugin.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree_plugin.h
16175 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/tree_plugin.h        2017-04-16 10:38:29.000000000 +0200
16176 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/tree_plugin.h     2017-04-18 17:54:26.000000000 +0200
16177 @@ -24,25 +24,10 @@
16178   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
16179   */
16181 -#include <linux/delay.h>
16182 -#include <linux/gfp.h>
16183 -#include <linux/oom.h>
16184 -#include <linux/smpboot.h>
16185 -#include "../time/tick-internal.h"
16187  #ifdef CONFIG_RCU_BOOST
16189  #include "../locking/rtmutex_common.h"
16192 - * Control variables for per-CPU and per-rcu_node kthreads.  These
16193 - * handle all flavors of RCU.
16194 - */
16195 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
16196 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16197 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16198 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
16200  #else /* #ifdef CONFIG_RCU_BOOST */
16202  /*
16203 @@ -55,6 +40,14 @@
16205  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16208 + * Control variables for per-CPU and per-rcu_node kthreads.  These
16209 + * handle all flavors of RCU.
16210 + */
16211 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16212 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16213 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
16215  #ifdef CONFIG_RCU_NOCB_CPU
16216  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
16217  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
16218 @@ -426,7 +419,7 @@
16219         }
16221         /* Hardware IRQ handlers cannot block, complain if they get here. */
16222 -       if (in_irq() || in_serving_softirq()) {
16223 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
16224                 lockdep_rcu_suspicious(__FILE__, __LINE__,
16225                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
16226                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
16227 @@ -632,15 +625,6 @@
16228                 t->rcu_read_unlock_special.b.need_qs = true;
16231 -#ifdef CONFIG_RCU_BOOST
16233 -static void rcu_preempt_do_callbacks(void)
16235 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
16238 -#endif /* #ifdef CONFIG_RCU_BOOST */
16240  /*
16241   * Queue a preemptible-RCU callback for invocation after a grace period.
16242   */
16243 @@ -829,6 +813,19 @@
16245  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
16248 + * If boosting, set rcuc kthreads to realtime priority.
16249 + */
16250 +static void rcu_cpu_kthread_setup(unsigned int cpu)
16252 +#ifdef CONFIG_RCU_BOOST
16253 +       struct sched_param sp;
16255 +       sp.sched_priority = kthread_prio;
16256 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16257 +#endif /* #ifdef CONFIG_RCU_BOOST */
16260  #ifdef CONFIG_RCU_BOOST
16262  #include "../locking/rtmutex_common.h"
16263 @@ -860,16 +857,6 @@
16265  #endif /* #else #ifdef CONFIG_RCU_TRACE */
16267 -static void rcu_wake_cond(struct task_struct *t, int status)
16269 -       /*
16270 -        * If the thread is yielding, only wake it when this
16271 -        * is invoked from idle
16272 -        */
16273 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
16274 -               wake_up_process(t);
16277  /*
16278   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
16279   * or ->boost_tasks, advancing the pointer to the next task in the
16280 @@ -1013,23 +1000,6 @@
16283  /*
16284 - * Wake up the per-CPU kthread to invoke RCU callbacks.
16285 - */
16286 -static void invoke_rcu_callbacks_kthread(void)
16288 -       unsigned long flags;
16290 -       local_irq_save(flags);
16291 -       __this_cpu_write(rcu_cpu_has_work, 1);
16292 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
16293 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
16294 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
16295 -                             __this_cpu_read(rcu_cpu_kthread_status));
16296 -       }
16297 -       local_irq_restore(flags);
16301   * Is the current CPU running the RCU-callbacks kthread?
16302   * Caller must have preemption disabled.
16303   */
16304 @@ -1083,67 +1053,6 @@
16305         return 0;
16308 -static void rcu_kthread_do_work(void)
16310 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
16311 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
16312 -       rcu_preempt_do_callbacks();
16315 -static void rcu_cpu_kthread_setup(unsigned int cpu)
16317 -       struct sched_param sp;
16319 -       sp.sched_priority = kthread_prio;
16320 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16323 -static void rcu_cpu_kthread_park(unsigned int cpu)
16325 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16328 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
16330 -       return __this_cpu_read(rcu_cpu_has_work);
16334 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16335 - * RCU softirq used in flavors and configurations of RCU that do not
16336 - * support RCU priority boosting.
16337 - */
16338 -static void rcu_cpu_kthread(unsigned int cpu)
16340 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16341 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16342 -       int spincnt;
16344 -       for (spincnt = 0; spincnt < 10; spincnt++) {
16345 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16346 -               local_bh_disable();
16347 -               *statusp = RCU_KTHREAD_RUNNING;
16348 -               this_cpu_inc(rcu_cpu_kthread_loops);
16349 -               local_irq_disable();
16350 -               work = *workp;
16351 -               *workp = 0;
16352 -               local_irq_enable();
16353 -               if (work)
16354 -                       rcu_kthread_do_work();
16355 -               local_bh_enable();
16356 -               if (*workp == 0) {
16357 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16358 -                       *statusp = RCU_KTHREAD_WAITING;
16359 -                       return;
16360 -               }
16361 -       }
16362 -       *statusp = RCU_KTHREAD_YIELDING;
16363 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16364 -       schedule_timeout_interruptible(2);
16365 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16366 -       *statusp = RCU_KTHREAD_WAITING;
16369  /*
16370   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
16371   * served by the rcu_node in question.  The CPU hotplug lock is still
16372 @@ -1174,26 +1083,12 @@
16373         free_cpumask_var(cm);
16376 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16377 -       .store                  = &rcu_cpu_kthread_task,
16378 -       .thread_should_run      = rcu_cpu_kthread_should_run,
16379 -       .thread_fn              = rcu_cpu_kthread,
16380 -       .thread_comm            = "rcuc/%u",
16381 -       .setup                  = rcu_cpu_kthread_setup,
16382 -       .park                   = rcu_cpu_kthread_park,
16385  /*
16386   * Spawn boost kthreads -- called as soon as the scheduler is running.
16387   */
16388  static void __init rcu_spawn_boost_kthreads(void)
16390         struct rcu_node *rnp;
16391 -       int cpu;
16393 -       for_each_possible_cpu(cpu)
16394 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
16395 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16396         rcu_for_each_leaf_node(rcu_state_p, rnp)
16397                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
16399 @@ -1216,11 +1111,6 @@
16400         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
16403 -static void invoke_rcu_callbacks_kthread(void)
16405 -       WARN_ON_ONCE(1);
16408  static bool rcu_is_callbacks_kthread(void)
16410         return false;
16411 @@ -1244,7 +1134,7 @@
16413  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16415 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
16416 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
16418  /*
16419   * Check to see if any future RCU-related work will need to be done
16420 @@ -1261,7 +1151,9 @@
16421         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
16422                ? 0 : rcu_cpu_has_callbacks(NULL);
16424 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
16426 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
16427  /*
16428   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
16429   * after it.
16430 @@ -1357,6 +1249,8 @@
16431         return cbs_ready;
16434 +#ifndef CONFIG_PREEMPT_RT_FULL
16436  /*
16437   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
16438   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
16439 @@ -1402,6 +1296,7 @@
16440         *nextevt = basemono + dj * TICK_NSEC;
16441         return 0;
16443 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
16445  /*
16446   * Prepare a CPU for idle from an RCU perspective.  The first major task
16447 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/update.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/update.c
16448 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/rcu/update.c     2017-04-16 10:38:29.000000000 +0200
16449 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/rcu/update.c  2017-04-18 17:54:26.000000000 +0200
16450 @@ -62,7 +62,7 @@
16451  #ifndef CONFIG_TINY_RCU
16452  module_param(rcu_expedited, int, 0);
16453  module_param(rcu_normal, int, 0);
16454 -static int rcu_normal_after_boot;
16455 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
16456  module_param(rcu_normal_after_boot, int, 0);
16457  #endif /* #ifndef CONFIG_TINY_RCU */
16459 @@ -132,8 +132,7 @@
16461  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
16463 -static atomic_t rcu_expedited_nesting =
16464 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
16465 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
16467  /*
16468   * Should normal grace-period primitives be expedited?  Intended for
16469 @@ -182,8 +181,7 @@
16470   */
16471  void rcu_end_inkernel_boot(void)
16473 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
16474 -               rcu_unexpedite_gp();
16475 +       rcu_unexpedite_gp();
16476         if (rcu_normal_after_boot)
16477                 WRITE_ONCE(rcu_normal, 1);
16479 @@ -298,6 +296,7 @@
16481  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
16483 +#ifndef CONFIG_PREEMPT_RT_FULL
16484  /**
16485   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
16486   *
16487 @@ -324,6 +323,7 @@
16488         return in_softirq() || irqs_disabled();
16490  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
16491 +#endif
16493  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
16495 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/Makefile linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/Makefile
16496 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/Makefile   2017-04-16 10:38:29.000000000 +0200
16497 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/Makefile        2017-04-18 17:54:26.000000000 +0200
16498 @@ -17,7 +17,7 @@
16500  obj-y += core.o loadavg.o clock.o cputime.o
16501  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16502 -obj-y += wait.o swait.o completion.o idle.o
16503 +obj-y += wait.o swait.o swork.o completion.o idle.o
16504  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
16505  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
16506  obj-$(CONFIG_SCHEDSTATS) += stats.o
16507 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/completion.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/completion.c
16508 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/completion.c       2017-04-16 10:38:29.000000000 +0200
16509 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/completion.c    2017-04-18 17:54:26.000000000 +0200
16510 @@ -30,10 +30,10 @@
16512         unsigned long flags;
16514 -       spin_lock_irqsave(&x->wait.lock, flags);
16515 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16516         x->done++;
16517 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
16518 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16519 +       swake_up_locked(&x->wait);
16520 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16522  EXPORT_SYMBOL(complete);
16524 @@ -50,10 +50,10 @@
16526         unsigned long flags;
16528 -       spin_lock_irqsave(&x->wait.lock, flags);
16529 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16530         x->done += UINT_MAX/2;
16531 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
16532 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16533 +       swake_up_all_locked(&x->wait);
16534 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16536  EXPORT_SYMBOL(complete_all);
16538 @@ -62,20 +62,20 @@
16539                    long (*action)(long), long timeout, int state)
16541         if (!x->done) {
16542 -               DECLARE_WAITQUEUE(wait, current);
16543 +               DECLARE_SWAITQUEUE(wait);
16545 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
16546 +               __prepare_to_swait(&x->wait, &wait);
16547                 do {
16548                         if (signal_pending_state(state, current)) {
16549                                 timeout = -ERESTARTSYS;
16550                                 break;
16551                         }
16552                         __set_current_state(state);
16553 -                       spin_unlock_irq(&x->wait.lock);
16554 +                       raw_spin_unlock_irq(&x->wait.lock);
16555                         timeout = action(timeout);
16556 -                       spin_lock_irq(&x->wait.lock);
16557 +                       raw_spin_lock_irq(&x->wait.lock);
16558                 } while (!x->done && timeout);
16559 -               __remove_wait_queue(&x->wait, &wait);
16560 +               __finish_swait(&x->wait, &wait);
16561                 if (!x->done)
16562                         return timeout;
16563         }
16564 @@ -89,9 +89,9 @@
16566         might_sleep();
16568 -       spin_lock_irq(&x->wait.lock);
16569 +       raw_spin_lock_irq(&x->wait.lock);
16570         timeout = do_wait_for_common(x, action, timeout, state);
16571 -       spin_unlock_irq(&x->wait.lock);
16572 +       raw_spin_unlock_irq(&x->wait.lock);
16573         return timeout;
16576 @@ -277,12 +277,12 @@
16577         if (!READ_ONCE(x->done))
16578                 return 0;
16580 -       spin_lock_irqsave(&x->wait.lock, flags);
16581 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16582         if (!x->done)
16583                 ret = 0;
16584         else
16585                 x->done--;
16586 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16587 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16588         return ret;
16590  EXPORT_SYMBOL(try_wait_for_completion);
16591 @@ -311,7 +311,7 @@
16592          * after it's acquired the lock.
16593          */
16594         smp_rmb();
16595 -       spin_unlock_wait(&x->wait.lock);
16596 +       raw_spin_unlock_wait(&x->wait.lock);
16597         return true;
16599  EXPORT_SYMBOL(completion_done);
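
The completion.c hunks above switch the completion's internals from a wait_queue_head_t to a simple waitqueue (swait) protected by a raw spinlock, so complete() and complete_all() stay callable from hard-IRQ context on PREEMPT_RT_FULL, where ordinary spinlocks become sleeping locks. The caller-visible API is unchanged; a minimal usage sketch (illustrative only, not part of the patch, demo_* names are invented):

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *arg)
{
	/* ... perform the deferred setup ... */
	complete(&demo_done);			/* raw lock + swake_up_locked() on RT */
	return 0;
}

static int demo_start_and_wait(void)
{
	struct task_struct *t = kthread_run(demo_worker, NULL, "demo_worker");

	if (IS_ERR(t))
		return PTR_ERR(t);
	wait_for_completion(&demo_done);	/* sleeps on the swait queue */
	return 0;
}
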
16600 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/core.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/core.c
16601 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/core.c     2017-04-16 10:38:29.000000000 +0200
16602 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/core.c  2017-04-18 17:54:26.000000000 +0200
16603 @@ -129,7 +129,11 @@
16604   * Number of tasks to iterate in a single balance run.
16605   * Limited because this is done with IRQs disabled.
16606   */
16607 +#ifndef CONFIG_PREEMPT_RT_FULL
16608  const_debug unsigned int sysctl_sched_nr_migrate = 32;
16609 +#else
16610 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
16611 +#endif
16613  /*
16614   * period over which we average the RT time consumption, measured
16615 @@ -345,6 +349,7 @@
16617         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16618         rq->hrtick_timer.function = hrtick;
16619 +       rq->hrtick_timer.irqsafe = 1;
16621  #else  /* CONFIG_SCHED_HRTICK */
16622  static inline void hrtick_clear(struct rq *rq)
16623 @@ -449,7 +454,7 @@
16624         head->lastp = &node->next;
16627 -void wake_up_q(struct wake_q_head *head)
16628 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
16630         struct wake_q_node *node = head->first;
16632 @@ -466,7 +471,10 @@
16633                  * wake_up_process() implies a wmb() to pair with the queueing
16634                  * in wake_q_add() so as not to miss wakeups.
16635                  */
16636 -               wake_up_process(task);
16637 +               if (sleeper)
16638 +                       wake_up_lock_sleeper(task);
16639 +               else
16640 +                       wake_up_process(task);
16641                 put_task_struct(task);
16642         }
16644 @@ -502,6 +510,38 @@
16645                 trace_sched_wake_idle_without_ipi(cpu);
16648 +#ifdef CONFIG_PREEMPT_LAZY
16649 +void resched_curr_lazy(struct rq *rq)
16651 +       struct task_struct *curr = rq->curr;
16652 +       int cpu;
16654 +       if (!sched_feat(PREEMPT_LAZY)) {
16655 +               resched_curr(rq);
16656 +               return;
16657 +       }
16659 +       lockdep_assert_held(&rq->lock);
16661 +       if (test_tsk_need_resched(curr))
16662 +               return;
16664 +       if (test_tsk_need_resched_lazy(curr))
16665 +               return;
16667 +       set_tsk_need_resched_lazy(curr);
16669 +       cpu = cpu_of(rq);
16670 +       if (cpu == smp_processor_id())
16671 +               return;
16673 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
16674 +       smp_mb();
16675 +       if (!tsk_is_polling(curr))
16676 +               smp_send_reschedule(cpu);
16678 +#endif
16680  void resched_cpu(int cpu)
16682         struct rq *rq = cpu_rq(cpu);
16683 @@ -525,11 +565,14 @@
16684   */
16685  int get_nohz_timer_target(void)
16687 -       int i, cpu = smp_processor_id();
16688 +       int i, cpu;
16689         struct sched_domain *sd;
16691 +       preempt_disable_rt();
16692 +       cpu = smp_processor_id();
16694         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
16695 -               return cpu;
16696 +               goto preempt_en_rt;
16698         rcu_read_lock();
16699         for_each_domain(cpu, sd) {
16700 @@ -548,6 +591,8 @@
16701                 cpu = housekeeping_any_cpu();
16702  unlock:
16703         rcu_read_unlock();
16704 +preempt_en_rt:
16705 +       preempt_enable_rt();
16706         return cpu;
16708  /*
16709 @@ -1100,6 +1145,11 @@
16711         lockdep_assert_held(&p->pi_lock);
16713 +       if (__migrate_disabled(p)) {
16714 +               cpumask_copy(&p->cpus_allowed, new_mask);
16715 +               return;
16716 +       }
16718         queued = task_on_rq_queued(p);
16719         running = task_current(rq, p);
16721 @@ -1122,6 +1172,84 @@
16722                 set_curr_task(rq, p);
16725 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
16726 +static DEFINE_MUTEX(sched_down_mutex);
16727 +static cpumask_t sched_down_cpumask;
16729 +void tell_sched_cpu_down_begin(int cpu)
16731 +       mutex_lock(&sched_down_mutex);
16732 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
16733 +       mutex_unlock(&sched_down_mutex);
16736 +void tell_sched_cpu_down_done(int cpu)
16738 +       mutex_lock(&sched_down_mutex);
16739 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
16740 +       mutex_unlock(&sched_down_mutex);
16743 +/**
16744 + * migrate_me - try to move the current task off this cpu
16745 + *
16746 + * Used by the pin_current_cpu() code to try to get tasks
16747 + * to move off the current CPU as it is going down.
16748 + * It will only move the task if the task isn't pinned to
16749 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
16750 + * and the task has to be in a RUNNING state. Otherwise the
16751 + * and the task is in the RUNNING state. Otherwise moving the
16752 + * task would wake it up (change its state to running) when the
16753 + * task did not expect it.
16754 + * Returns 1 if it succeeded in moving the current task
16755 + *         0 otherwise.
16756 + */
16757 +int migrate_me(void)
16759 +       struct task_struct *p = current;
16760 +       struct migration_arg arg;
16761 +       struct cpumask *cpumask;
16762 +       struct cpumask *mask;
16763 +       unsigned int dest_cpu;
16764 +       struct rq_flags rf;
16765 +       struct rq *rq;
16767 +       /*
16768 +        * We cannot migrate tasks bound to a CPU or tasks that are
16769 +        * not running. Moving such a task would wake it up.
16770 +        */
16771 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
16772 +               return 0;
16774 +       mutex_lock(&sched_down_mutex);
16775 +       rq = task_rq_lock(p, &rf);
16777 +       cpumask = this_cpu_ptr(&sched_cpumasks);
16778 +       mask = &p->cpus_allowed;
16780 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
16782 +       if (!cpumask_weight(cpumask)) {
16783 +               /* It's only on this CPU? */
16784 +               task_rq_unlock(rq, p, &rf);
16785 +               mutex_unlock(&sched_down_mutex);
16786 +               return 0;
16787 +       }
16789 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
16791 +       arg.task = p;
16792 +       arg.dest_cpu = dest_cpu;
16794 +       task_rq_unlock(rq, p, &rf);
16796 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
16797 +       tlb_migrate_finish(p->mm);
16798 +       mutex_unlock(&sched_down_mutex);
16800 +       return 1;
16803  /*
16804   * Change a given task's CPU affinity. Migrate the thread to a
16805   * proper CPU and schedule it away if the CPU it's executing on
16806 @@ -1179,7 +1307,7 @@
16807         }
16809         /* Can the task run on the task's current CPU? If so, we're done */
16810 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
16811 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
16812                 goto out;
16814         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
16815 @@ -1366,6 +1494,18 @@
16816         return ret;
16819 +static bool check_task_state(struct task_struct *p, long match_state)
16821 +       bool match = false;
16823 +       raw_spin_lock_irq(&p->pi_lock);
16824 +       if (p->state == match_state || p->saved_state == match_state)
16825 +               match = true;
16826 +       raw_spin_unlock_irq(&p->pi_lock);
16828 +       return match;
16831  /*
16832   * wait_task_inactive - wait for a thread to unschedule.
16833   *
16834 @@ -1410,7 +1550,7 @@
16835                  * is actually now running somewhere else!
16836                  */
16837                 while (task_running(rq, p)) {
16838 -                       if (match_state && unlikely(p->state != match_state))
16839 +                       if (match_state && !check_task_state(p, match_state))
16840                                 return 0;
16841                         cpu_relax();
16842                 }
16843 @@ -1425,7 +1565,8 @@
16844                 running = task_running(rq, p);
16845                 queued = task_on_rq_queued(p);
16846                 ncsw = 0;
16847 -               if (!match_state || p->state == match_state)
16848 +               if (!match_state || p->state == match_state ||
16849 +                   p->saved_state == match_state)
16850                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
16851                 task_rq_unlock(rq, p, &rf);
16853 @@ -1680,10 +1821,6 @@
16855         activate_task(rq, p, en_flags);
16856         p->on_rq = TASK_ON_RQ_QUEUED;
16858 -       /* if a worker is waking up, notify workqueue */
16859 -       if (p->flags & PF_WQ_WORKER)
16860 -               wq_worker_waking_up(p, cpu_of(rq));
16863  /*
16864 @@ -2018,8 +2155,27 @@
16865          */
16866         smp_mb__before_spinlock();
16867         raw_spin_lock_irqsave(&p->pi_lock, flags);
16868 -       if (!(p->state & state))
16869 +       if (!(p->state & state)) {
16870 +               /*
16871 +                * The task might be running due to a spinlock sleeper
16872 +                * wakeup. Check the saved state and set it to running
16873 +                * if the wakeup condition is true.
16874 +                */
16875 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
16876 +                       if (p->saved_state & state) {
16877 +                               p->saved_state = TASK_RUNNING;
16878 +                               success = 1;
16879 +                       }
16880 +               }
16881                 goto out;
16882 +       }
16884 +       /*
16885 +        * If this is a regular wakeup, then we can unconditionally
16886 +        * clear the saved state of a "lock sleeper".
16887 +        */
16888 +       if (!(wake_flags & WF_LOCK_SLEEPER))
16889 +               p->saved_state = TASK_RUNNING;
16891         trace_sched_waking(p);
16893 @@ -2102,53 +2258,6 @@
16896  /**
16897 - * try_to_wake_up_local - try to wake up a local task with rq lock held
16898 - * @p: the thread to be awakened
16899 - * @cookie: context's cookie for pinning
16900 - *
16901 - * Put @p on the run-queue if it's not already there. The caller must
16902 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
16903 - * the current task.
16904 - */
16905 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
16907 -       struct rq *rq = task_rq(p);
16909 -       if (WARN_ON_ONCE(rq != this_rq()) ||
16910 -           WARN_ON_ONCE(p == current))
16911 -               return;
16913 -       lockdep_assert_held(&rq->lock);
16915 -       if (!raw_spin_trylock(&p->pi_lock)) {
16916 -               /*
16917 -                * This is OK, because current is on_cpu, which avoids it being
16918 -                * picked for load-balance and preemption/IRQs are still
16919 -                * disabled avoiding further scheduler activity on it and we've
16920 -                * not yet picked a replacement task.
16921 -                */
16922 -               lockdep_unpin_lock(&rq->lock, cookie);
16923 -               raw_spin_unlock(&rq->lock);
16924 -               raw_spin_lock(&p->pi_lock);
16925 -               raw_spin_lock(&rq->lock);
16926 -               lockdep_repin_lock(&rq->lock, cookie);
16927 -       }
16929 -       if (!(p->state & TASK_NORMAL))
16930 -               goto out;
16932 -       trace_sched_waking(p);
16934 -       if (!task_on_rq_queued(p))
16935 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
16937 -       ttwu_do_wakeup(rq, p, 0, cookie);
16938 -       ttwu_stat(p, smp_processor_id(), 0);
16939 -out:
16940 -       raw_spin_unlock(&p->pi_lock);
16943 -/**
16944   * wake_up_process - Wake up a specific process
16945   * @p: The process to be woken up.
16946   *
16947 @@ -2166,6 +2275,18 @@
16949  EXPORT_SYMBOL(wake_up_process);
16951 +/**
16952 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
16953 + * @p: The process to be woken up.
16954 + *
16955 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
16956 + * the nature of the wakeup.
16957 + */
16958 +int wake_up_lock_sleeper(struct task_struct *p)
16960 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
16963  int wake_up_state(struct task_struct *p, unsigned int state)
16965         return try_to_wake_up(p, state, 0);
16966 @@ -2442,6 +2563,9 @@
16967         p->on_cpu = 0;
16968  #endif
16969         init_task_preempt_count(p);
16970 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
16971 +       task_thread_info(p)->preempt_lazy_count = 0;
16972 +#endif
16973  #ifdef CONFIG_SMP
16974         plist_node_init(&p->pushable_tasks, MAX_PRIO);
16975         RB_CLEAR_NODE(&p->pushable_dl_tasks);
16976 @@ -2770,21 +2894,16 @@
16977         finish_arch_post_lock_switch();
16979         fire_sched_in_preempt_notifiers(current);
16980 +       /*
16981 +        * We use mmdrop_delayed() here so we don't have to do the
16982 +        * full __mmdrop() when we are the last user.
16983 +        */
16984         if (mm)
16985 -               mmdrop(mm);
16986 +               mmdrop_delayed(mm);
16987         if (unlikely(prev_state == TASK_DEAD)) {
16988                 if (prev->sched_class->task_dead)
16989                         prev->sched_class->task_dead(prev);
16991 -               /*
16992 -                * Remove function-return probe instances associated with this
16993 -                * task and put them back on the free list.
16994 -                */
16995 -               kprobe_flush_task(prev);
16997 -               /* Task is done with its stack. */
16998 -               put_task_stack(prev);
17000                 put_task_struct(prev);
17001         }
17003 @@ -3252,6 +3371,77 @@
17004         schedstat_inc(this_rq()->sched_count);
17007 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
17009 +void migrate_disable(void)
17011 +       struct task_struct *p = current;
17013 +       if (in_atomic() || irqs_disabled()) {
17014 +#ifdef CONFIG_SCHED_DEBUG
17015 +               p->migrate_disable_atomic++;
17016 +#endif
17017 +               return;
17018 +       }
17020 +#ifdef CONFIG_SCHED_DEBUG
17021 +       if (unlikely(p->migrate_disable_atomic)) {
17022 +               tracing_off();
17023 +               WARN_ON_ONCE(1);
17024 +       }
17025 +#endif
17027 +       if (p->migrate_disable) {
17028 +               p->migrate_disable++;
17029 +               return;
17030 +       }
17032 +       preempt_disable();
17033 +       preempt_lazy_disable();
17034 +       pin_current_cpu();
17035 +       p->migrate_disable = 1;
17036 +       preempt_enable();
17038 +EXPORT_SYMBOL(migrate_disable);
17040 +void migrate_enable(void)
17042 +       struct task_struct *p = current;
17044 +       if (in_atomic() || irqs_disabled()) {
17045 +#ifdef CONFIG_SCHED_DEBUG
17046 +               p->migrate_disable_atomic--;
17047 +#endif
17048 +               return;
17049 +       }
17051 +#ifdef CONFIG_SCHED_DEBUG
17052 +       if (unlikely(p->migrate_disable_atomic)) {
17053 +               tracing_off();
17054 +               WARN_ON_ONCE(1);
17055 +       }
17056 +#endif
17057 +       WARN_ON_ONCE(p->migrate_disable <= 0);
17059 +       if (p->migrate_disable > 1) {
17060 +               p->migrate_disable--;
17061 +               return;
17062 +       }
17064 +       preempt_disable();
17065 +       /*
17066 +        * Clearing migrate_disable causes tsk_cpus_allowed to
17067 +        * show the task's original cpu affinity.
17068 +        */
17069 +       p->migrate_disable = 0;
17071 +       unpin_current_cpu();
17072 +       preempt_enable();
17073 +       preempt_lazy_enable();
17075 +EXPORT_SYMBOL(migrate_enable);
17076 +#endif
17078  /*
17079   * Pick up the highest-prio task:
17080   */
17081 @@ -3368,19 +3558,6 @@
17082                 } else {
17083                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
17084                         prev->on_rq = 0;
17086 -                       /*
17087 -                        * If a worker went to sleep, notify and ask workqueue
17088 -                        * whether it wants to wake up a task to maintain
17089 -                        * concurrency.
17090 -                        */
17091 -                       if (prev->flags & PF_WQ_WORKER) {
17092 -                               struct task_struct *to_wakeup;
17094 -                               to_wakeup = wq_worker_sleeping(prev);
17095 -                               if (to_wakeup)
17096 -                                       try_to_wake_up_local(to_wakeup, cookie);
17097 -                       }
17098                 }
17099                 switch_count = &prev->nvcsw;
17100         }
17101 @@ -3390,6 +3567,7 @@
17103         next = pick_next_task(rq, prev, cookie);
17104         clear_tsk_need_resched(prev);
17105 +       clear_tsk_need_resched_lazy(prev);
17106         clear_preempt_need_resched();
17107         rq->clock_skip_update = 0;
17109 @@ -3437,9 +3615,20 @@
17111  static inline void sched_submit_work(struct task_struct *tsk)
17113 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
17114 +       if (!tsk->state)
17115                 return;
17116         /*
17117 +        * If a worker went to sleep, notify and ask workqueue whether
17118 +        * it wants to wake up a task to maintain concurrency.
17119 +        */
17120 +       if (tsk->flags & PF_WQ_WORKER)
17121 +               wq_worker_sleeping(tsk);
17124 +       if (tsk_is_pi_blocked(tsk))
17125 +               return;
17127 +       /*
17128          * If we are going to sleep and we have plugged IO queued,
17129          * make sure to submit it to avoid deadlocks.
17130          */
17131 @@ -3447,6 +3636,12 @@
17132                 blk_schedule_flush_plug(tsk);
17135 +static void sched_update_worker(struct task_struct *tsk)
17137 +       if (tsk->flags & PF_WQ_WORKER)
17138 +               wq_worker_running(tsk);
17141  asmlinkage __visible void __sched schedule(void)
17143         struct task_struct *tsk = current;
17144 @@ -3457,6 +3652,7 @@
17145                 __schedule(false);
17146                 sched_preempt_enable_no_resched();
17147         } while (need_resched());
17148 +       sched_update_worker(tsk);
17150  EXPORT_SYMBOL(schedule);
17152 @@ -3520,6 +3716,30 @@
17153         } while (need_resched());
17156 +#ifdef CONFIG_PREEMPT_LAZY
17158 + * If TIF_NEED_RESCHED is set, we allow being scheduled away, since it is
17159 + * set by an RT task. Otherwise we try to avoid being scheduled out as long
17160 + * as the preempt_lazy_count counter is > 0.
17161 + */
17162 +static __always_inline int preemptible_lazy(void)
17164 +       if (test_thread_flag(TIF_NEED_RESCHED))
17165 +               return 1;
17166 +       if (current_thread_info()->preempt_lazy_count)
17167 +               return 0;
17168 +       return 1;
17171 +#else
17173 +static inline int preemptible_lazy(void)
17175 +       return 1;
17178 +#endif
17180  #ifdef CONFIG_PREEMPT
17181  /*
17182   * this is the entry point to schedule() from in-kernel preemption
17183 @@ -3534,7 +3754,8 @@
17184          */
17185         if (likely(!preemptible()))
17186                 return;
17188 +       if (!preemptible_lazy())
17189 +               return;
17190         preempt_schedule_common();
17192  NOKPROBE_SYMBOL(preempt_schedule);
17193 @@ -3561,6 +3782,9 @@
17194         if (likely(!preemptible()))
17195                 return;
17197 +       if (!preemptible_lazy())
17198 +               return;
17200         do {
17201                 /*
17202                  * Because the function tracer can trace preempt_count_sub()
17203 @@ -3583,7 +3807,16 @@
17204                  * an infinite recursion.
17205                  */
17206                 prev_ctx = exception_enter();
17207 +               /*
17208 +                * The add/subtract must not be traced by the function
17209 +                * tracer. But we still want to account for the
17210 +                * preempt off latency tracer. Since the _notrace versions
17211 +                * of add/subtract skip the accounting for latency tracer
17212 +                * we must force it manually.
17213 +                */
17214 +               start_critical_timings();
17215                 __schedule(true);
17216 +               stop_critical_timings();
17217                 exception_exit(prev_ctx);
17219                 preempt_latency_stop(1);
17220 @@ -4939,6 +5172,7 @@
17222  EXPORT_SYMBOL(__cond_resched_lock);
17224 +#ifndef CONFIG_PREEMPT_RT_FULL
17225  int __sched __cond_resched_softirq(void)
17227         BUG_ON(!in_softirq());
17228 @@ -4952,6 +5186,7 @@
17229         return 0;
17231  EXPORT_SYMBOL(__cond_resched_softirq);
17232 +#endif
17234  /**
17235   * yield - yield the current processor to other threads.
17236 @@ -5315,7 +5550,9 @@
17238         /* Set the preempt count _outside_ the spinlocks! */
17239         init_idle_preempt_count(idle, cpu);
17241 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
17242 +       task_thread_info(idle)->preempt_lazy_count = 0;
17243 +#endif
17244         /*
17245          * The idle tasks have their own, simple scheduling class:
17246          */
17247 @@ -5458,6 +5695,8 @@
17248  #endif /* CONFIG_NUMA_BALANCING */
17250  #ifdef CONFIG_HOTPLUG_CPU
17251 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
17253  /*
17254   * Ensures that the idle task is using init_mm right before its cpu goes
17255   * offline.
17256 @@ -5472,7 +5711,12 @@
17257                 switch_mm_irqs_off(mm, &init_mm, current);
17258                 finish_arch_post_lock_switch();
17259         }
17260 -       mmdrop(mm);
17261 +       /*
17262 +        * Defer the cleanup to a CPU which is still alive. On RT we can neither
17263 +        * call mmdrop() nor mmdrop_delayed() from here.
17264 +        */
17265 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
17269  /*
17270 @@ -7418,6 +7662,10 @@
17271         update_max_interval();
17272         nohz_balance_exit_idle(cpu);
17273         hrtick_clear(rq);
17274 +       if (per_cpu(idle_last_mm, cpu)) {
17275 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
17276 +               per_cpu(idle_last_mm, cpu) = NULL;
17277 +       }
17278         return 0;
17280  #endif
17281 @@ -7698,7 +7946,7 @@
17282  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
17283  static inline int preempt_count_equals(int preempt_offset)
17285 -       int nested = preempt_count() + rcu_preempt_depth();
17286 +       int nested = preempt_count() + sched_rcu_preempt_depth();
17288         return (nested == preempt_offset);
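
Among other things, the core.c hunks above add migrate_disable()/migrate_enable() for PREEMPT_RT_FULL on SMP: calls nest through a per-task counter, and only the outermost pair pins the task to its current CPU (and toggles lazy preemption) while leaving it otherwise preemptible. A hedged sketch of code relying on that nesting; demo_* names are invented and the declarations are assumed to come from linux/preempt.h, where this patch adds them:

#include <linux/preempt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_inner(void)
{
	migrate_disable();		/* nested: only bumps p->migrate_disable */
	this_cpu_inc(demo_hits);	/* still guaranteed to run on the same CPU */
	migrate_enable();
}

static void demo_outer(void)
{
	migrate_disable();		/* outermost: pins the current CPU */
	this_cpu_inc(demo_hits);
	demo_inner();
	migrate_enable();		/* outermost: unpins, migration allowed again */
}
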
17290 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/deadline.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/deadline.c
17291 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/deadline.c 2017-04-16 10:38:29.000000000 +0200
17292 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/deadline.c      2017-04-18 17:54:26.000000000 +0200
17293 @@ -687,6 +687,7 @@
17295         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17296         timer->function = dl_task_timer;
17297 +       timer->irqsafe = 1;
17300  static
17301 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/debug.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/debug.c
17302 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/debug.c    2017-04-16 10:38:29.000000000 +0200
17303 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/debug.c 2017-04-18 17:54:26.000000000 +0200
17304 @@ -558,6 +558,9 @@
17305         P(rt_throttled);
17306         PN(rt_time);
17307         PN(rt_runtime);
17308 +#ifdef CONFIG_SMP
17309 +       P(rt_nr_migratory);
17310 +#endif
17312  #undef PN
17313  #undef P
17314 @@ -953,6 +956,10 @@
17315  #endif
17316         P(policy);
17317         P(prio);
17318 +#ifdef CONFIG_PREEMPT_RT_FULL
17319 +       P(migrate_disable);
17320 +#endif
17321 +       P(nr_cpus_allowed);
17322  #undef PN_SCHEDSTAT
17323  #undef PN
17324  #undef __PN
17325 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/fair.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/fair.c
17326 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/fair.c     2017-04-16 10:38:29.000000000 +0200
17327 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/fair.c  2017-04-18 17:54:26.000000000 +0200
17328 @@ -3518,7 +3518,7 @@
17329         ideal_runtime = sched_slice(cfs_rq, curr);
17330         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
17331         if (delta_exec > ideal_runtime) {
17332 -               resched_curr(rq_of(cfs_rq));
17333 +               resched_curr_lazy(rq_of(cfs_rq));
17334                 /*
17335                  * The current task ran long enough, ensure it doesn't get
17336                  * re-elected due to buddy favours.
17337 @@ -3542,7 +3542,7 @@
17338                 return;
17340         if (delta > ideal_runtime)
17341 -               resched_curr(rq_of(cfs_rq));
17342 +               resched_curr_lazy(rq_of(cfs_rq));
17345  static void
17346 @@ -3684,7 +3684,7 @@
17347          * validating it and just reschedule.
17348          */
17349         if (queued) {
17350 -               resched_curr(rq_of(cfs_rq));
17351 +               resched_curr_lazy(rq_of(cfs_rq));
17352                 return;
17353         }
17354         /*
17355 @@ -3866,7 +3866,7 @@
17356          * hierarchy can be throttled
17357          */
17358         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
17359 -               resched_curr(rq_of(cfs_rq));
17360 +               resched_curr_lazy(rq_of(cfs_rq));
17363  static __always_inline
17364 @@ -4494,7 +4494,7 @@
17366                 if (delta < 0) {
17367                         if (rq->curr == p)
17368 -                               resched_curr(rq);
17369 +                               resched_curr_lazy(rq);
17370                         return;
17371                 }
17372                 hrtick_start(rq, delta);
17373 @@ -5905,7 +5905,7 @@
17374         return;
17376  preempt:
17377 -       resched_curr(rq);
17378 +       resched_curr_lazy(rq);
17379         /*
17380          * Only set the backward buddy when the current task is still
17381          * on the rq. This can happen when a wakeup gets interleaved
17382 @@ -8631,7 +8631,7 @@
17383                  * 'current' within the tree based on its new key value.
17384                  */
17385                 swap(curr->vruntime, se->vruntime);
17386 -               resched_curr(rq);
17387 +               resched_curr_lazy(rq);
17388         }
17390         se->vruntime -= cfs_rq->min_vruntime;
17391 @@ -8655,7 +8655,7 @@
17392          */
17393         if (rq->curr == p) {
17394                 if (p->prio > oldprio)
17395 -                       resched_curr(rq);
17396 +                       resched_curr_lazy(rq);
17397         } else
17398                 check_preempt_curr(rq, p, 0);
17400 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/features.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/features.h
17401 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/features.h 2017-04-16 10:38:29.000000000 +0200
17402 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/features.h      2017-04-18 17:54:26.000000000 +0200
17403 @@ -45,11 +45,19 @@
17404   */
17405  SCHED_FEAT(NONTASK_CAPACITY, true)
17407 +#ifdef CONFIG_PREEMPT_RT_FULL
17408 +SCHED_FEAT(TTWU_QUEUE, false)
17409 +# ifdef CONFIG_PREEMPT_LAZY
17410 +SCHED_FEAT(PREEMPT_LAZY, true)
17411 +# endif
17412 +#else
17414  /*
17415   * Queue remote wakeups on the target CPU and process them
17416   * using the scheduler IPI. Reduces rq->lock contention/bounces.
17417   */
17418  SCHED_FEAT(TTWU_QUEUE, true)
17419 +#endif
17421  #ifdef HAVE_RT_PUSH_IPI
17422  /*
17423 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/rt.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/rt.c
17424 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/rt.c       2017-04-16 10:38:29.000000000 +0200
17425 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/rt.c    2017-04-18 17:54:26.000000000 +0200
17426 @@ -47,6 +47,7 @@
17428         hrtimer_init(&rt_b->rt_period_timer,
17429                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17430 +       rt_b->rt_period_timer.irqsafe = 1;
17431         rt_b->rt_period_timer.function = sched_rt_period_timer;
17434 @@ -101,6 +102,7 @@
17435         rt_rq->push_cpu = nr_cpu_ids;
17436         raw_spin_lock_init(&rt_rq->push_lock);
17437         init_irq_work(&rt_rq->push_work, push_irq_work_func);
17438 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
17439  #endif
17440  #endif /* CONFIG_SMP */
17441         /* We start is dequeued state, because no RT tasks are queued */
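
The deadline.c and rt.c hunks above set ->irqsafe = 1 on the scheduler's hrtimers and mark the RT push irq_work with IRQ_WORK_HARD_IRQ, keeping those callbacks in hard-IRQ context instead of the deferred softirq/threaded path that PREEMPT_RT_FULL normally uses. A hedged sketch of the same idiom for a hypothetical driver timer; the ->irqsafe field only exists with this patch applied and the demo_* names are invented:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* Runs in hard-IRQ context even on PREEMPT_RT_FULL, so it must not
	 * sleep or take rtmutex-based "sleeping" spinlocks. */
	return HRTIMER_NORESTART;
}

static void demo_timer_setup(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	demo_timer.irqsafe = 1;		/* do not defer this callback on RT */
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}
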
17442 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/sched.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/sched.h
17443 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/sched.h    2017-04-16 10:38:29.000000000 +0200
17444 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/sched.h 2017-04-18 17:54:26.000000000 +0200
17445 @@ -1163,6 +1163,7 @@
17446  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
17447  #define WF_FORK                0x02            /* child wakeup after fork */
17448  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
17449 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
17451  /*
17452   * To aid in avoiding the subversion of "niceness" due to uneven distribution
17453 @@ -1346,6 +1347,15 @@
17454  extern void resched_curr(struct rq *rq);
17455  extern void resched_cpu(int cpu);
17457 +#ifdef CONFIG_PREEMPT_LAZY
17458 +extern void resched_curr_lazy(struct rq *rq);
17459 +#else
17460 +static inline void resched_curr_lazy(struct rq *rq)
17462 +       resched_curr(rq);
17464 +#endif
17466  extern struct rt_bandwidth def_rt_bandwidth;
17467  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
17469 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/swait.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/swait.c
17470 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/swait.c    2017-04-16 10:38:29.000000000 +0200
17471 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/swait.c 2017-04-18 17:54:26.000000000 +0200
17472 @@ -1,5 +1,6 @@
17473  #include <linux/sched.h>
17474  #include <linux/swait.h>
17475 +#include <linux/suspend.h>
17477  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
17478                              struct lock_class_key *key)
17479 @@ -29,6 +30,25 @@
17481  EXPORT_SYMBOL(swake_up_locked);
17483 +void swake_up_all_locked(struct swait_queue_head *q)
17485 +       struct swait_queue *curr;
17486 +       int wakes = 0;
17488 +       while (!list_empty(&q->task_list)) {
17490 +               curr = list_first_entry(&q->task_list, typeof(*curr),
17491 +                                       task_list);
17492 +               wake_up_process(curr->task);
17493 +               list_del_init(&curr->task_list);
17494 +               wakes++;
17495 +       }
17496 +       if (pm_in_action)
17497 +               return;
17498 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
17500 +EXPORT_SYMBOL(swake_up_all_locked);
17502  void swake_up(struct swait_queue_head *q)
17504         unsigned long flags;
17505 @@ -54,6 +74,7 @@
17506         if (!swait_active(q))
17507                 return;
17509 +       WARN_ON(irqs_disabled());
17510         raw_spin_lock_irq(&q->lock);
17511         list_splice_init(&q->task_list, &tmp);
17512         while (!list_empty(&tmp)) {
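
swait.c above gains swake_up_all_locked(), used by the reworked complete_all(), plus a WARN_ON(irqs_disabled()) in swake_up_all(), since waking an unbounded number of waiters with interrupts off is precisely the kind of latency RT tries to avoid. For reference, basic producer/consumer use of the simple waitqueue API implemented in this file looks roughly like the sketch below (illustrative only; demo_* names are invented and the condition handshake is simplified):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;

static void demo_producer(void)
{
	demo_ready = true;
	swake_up(&demo_wq);		/* wakes at most one waiter */
}

static void demo_consumer(void)
{
	/* Sleeps until demo_ready is observed to be true. */
	swait_event(demo_wq, demo_ready);
}
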
17513 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/swork.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/swork.c
17514 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/sched/swork.c    1970-01-01 01:00:00.000000000 +0100
17515 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/sched/swork.c 2017-04-18 17:54:26.000000000 +0200
17516 @@ -0,0 +1,173 @@
17518 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
17519 + *
17520 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks
17521 + * from irq context. The callbacks are executed in kthread context.
17522 + */
17524 +#include <linux/swait.h>
17525 +#include <linux/swork.h>
17526 +#include <linux/kthread.h>
17527 +#include <linux/slab.h>
17528 +#include <linux/spinlock.h>
17529 +#include <linux/export.h>
17531 +#define SWORK_EVENT_PENDING     (1 << 0)
17533 +static DEFINE_MUTEX(worker_mutex);
17534 +static struct sworker *glob_worker;
17536 +struct sworker {
17537 +       struct list_head events;
17538 +       struct swait_queue_head wq;
17540 +       raw_spinlock_t lock;
17542 +       struct task_struct *task;
17543 +       int refs;
17546 +static bool swork_readable(struct sworker *worker)
17548 +       bool r;
17550 +       if (kthread_should_stop())
17551 +               return true;
17553 +       raw_spin_lock_irq(&worker->lock);
17554 +       r = !list_empty(&worker->events);
17555 +       raw_spin_unlock_irq(&worker->lock);
17557 +       return r;
17560 +static int swork_kthread(void *arg)
17562 +       struct sworker *worker = arg;
17564 +       for (;;) {
17565 +               swait_event_interruptible(worker->wq,
17566 +                                       swork_readable(worker));
17567 +               if (kthread_should_stop())
17568 +                       break;
17570 +               raw_spin_lock_irq(&worker->lock);
17571 +               while (!list_empty(&worker->events)) {
17572 +                       struct swork_event *sev;
17574 +                       sev = list_first_entry(&worker->events,
17575 +                                       struct swork_event, item);
17576 +                       list_del(&sev->item);
17577 +                       raw_spin_unlock_irq(&worker->lock);
17579 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
17580 +                                                        &sev->flags));
17581 +                       sev->func(sev);
17582 +                       raw_spin_lock_irq(&worker->lock);
17583 +               }
17584 +               raw_spin_unlock_irq(&worker->lock);
17585 +       }
17586 +       return 0;
17589 +static struct sworker *swork_create(void)
17591 +       struct sworker *worker;
17593 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
17594 +       if (!worker)
17595 +               return ERR_PTR(-ENOMEM);
17597 +       INIT_LIST_HEAD(&worker->events);
17598 +       raw_spin_lock_init(&worker->lock);
17599 +       init_swait_queue_head(&worker->wq);
17601 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
17602 +       if (IS_ERR(worker->task)) {
17603 +               kfree(worker);
17604 +               return ERR_PTR(-ENOMEM);
17605 +       }
17607 +       return worker;
17610 +static void swork_destroy(struct sworker *worker)
17612 +       kthread_stop(worker->task);
17614 +       WARN_ON(!list_empty(&worker->events));
17615 +       kfree(worker);
17618 +/**
17619 + * swork_queue - queue swork
17620 + *
17621 + * Returns %false if @sev was already on a queue, %true otherwise.
17622 + *
17623 + * The work is queued and processed on a random CPU.
17624 + */
17625 +bool swork_queue(struct swork_event *sev)
17627 +       unsigned long flags;
17629 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
17630 +               return false;
17632 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
17633 +       list_add_tail(&sev->item, &glob_worker->events);
17634 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
17636 +       swake_up(&glob_worker->wq);
17637 +       return true;
17639 +EXPORT_SYMBOL_GPL(swork_queue);
17641 +/**
17642 + * swork_get - get an instance of the sworker
17643 + *
17644 + * Returns a negative error code if the initialization of the worker
17645 + * failed, %0 otherwise.
17646 + *
17647 + */
17648 +int swork_get(void)
17650 +       struct sworker *worker;
17652 +       mutex_lock(&worker_mutex);
17653 +       if (!glob_worker) {
17654 +               worker = swork_create();
17655 +               if (IS_ERR(worker)) {
17656 +                       mutex_unlock(&worker_mutex);
17657 +                       return -ENOMEM;
17658 +               }
17660 +               glob_worker = worker;
17661 +       }
17663 +       glob_worker->refs++;
17664 +       mutex_unlock(&worker_mutex);
17666 +       return 0;
17668 +EXPORT_SYMBOL_GPL(swork_get);
17670 +/**
17671 + * swork_put - puts an instance of the sworker
17672 + *
17673 + * Will destroy the sworker thread. This function must not be called until all
17674 + * queued events have been completed.
17675 + */
17676 +void swork_put(void)
17678 +       mutex_lock(&worker_mutex);
17680 +       glob_worker->refs--;
17681 +       if (glob_worker->refs > 0)
17682 +               goto out;
17684 +       swork_destroy(glob_worker);
17685 +       glob_worker = NULL;
17686 +out:
17687 +       mutex_unlock(&worker_mutex);
17689 +EXPORT_SYMBOL_GPL(swork_put);
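
kernel/sched/swork.c is a new file: a minimal "simple work" framework whose single kswork kthread runs queued callbacks in process context, so code that must not block in hard-IRQ context on PREEMPT_RT_FULL can defer work there. A hedged usage sketch follows; demo_* names are invented, and the swork_event fields touched here (func, flags) are the ones this file dereferences, assumed to be exposed by the companion include/linux/swork.h added elsewhere in the patch:

#include <linux/swork.h>
#include <linux/interrupt.h>

static struct swork_event demo_event;

static void demo_cb(struct swork_event *sev)
{
	/* Runs in kswork's kthread context: sleeping is allowed here. */
}

static int demo_init(void)
{
	int ret = swork_get();		/* creates kswork for the first user */

	if (ret)
		return ret;
	demo_event.func = demo_cb;
	demo_event.flags = 0;
	return 0;
}

static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
	swork_queue(&demo_event);	/* safe from hard-IRQ context */
	return IRQ_HANDLED;
}

static void demo_exit(void)
{
	swork_put();			/* last user: kswork is stopped */
}
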
17690 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/signal.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/signal.c
17691 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/signal.c 2017-04-16 10:38:30.000000000 +0200
17692 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/signal.c      2017-04-18 17:54:26.000000000 +0200
17693 @@ -14,6 +14,7 @@
17694  #include <linux/export.h>
17695  #include <linux/init.h>
17696  #include <linux/sched.h>
17697 +#include <linux/sched/rt.h>
17698  #include <linux/fs.h>
17699  #include <linux/tty.h>
17700  #include <linux/binfmts.h>
17701 @@ -352,13 +353,30 @@
17702         return false;
17705 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
17707 +       struct sigqueue *q = t->sigqueue_cache;
17709 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
17710 +               return NULL;
17711 +       return q;
17714 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
17716 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
17717 +               return 0;
17718 +       return 1;
17721  /*
17722   * allocate a new signal queue record
17723   * - this may be called without locks if and only if t == current, otherwise an
17724   *   appropriate lock must be held to stop the target task from exiting
17725   */
17726  static struct sigqueue *
17727 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
17728 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
17729 +                   int override_rlimit, int fromslab)
17731         struct sigqueue *q = NULL;
17732         struct user_struct *user;
17733 @@ -375,7 +393,10 @@
17734         if (override_rlimit ||
17735             atomic_read(&user->sigpending) <=
17736                         task_rlimit(t, RLIMIT_SIGPENDING)) {
17737 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
17738 +               if (!fromslab)
17739 +                       q = get_task_cache(t);
17740 +               if (!q)
17741 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
17742         } else {
17743                 print_dropped_signal(sig);
17744         }
17745 @@ -392,6 +413,13 @@
17746         return q;
17749 +static struct sigqueue *
17750 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
17751 +                int override_rlimit)
17753 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
17756  static void __sigqueue_free(struct sigqueue *q)
17758         if (q->flags & SIGQUEUE_PREALLOC)
17759 @@ -401,6 +429,21 @@
17760         kmem_cache_free(sigqueue_cachep, q);
17763 +static void sigqueue_free_current(struct sigqueue *q)
17765 +       struct user_struct *up;
17767 +       if (q->flags & SIGQUEUE_PREALLOC)
17768 +               return;
17770 +       up = q->user;
17771 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
17772 +               atomic_dec(&up->sigpending);
17773 +               free_uid(up);
17774 +       } else
17775 +                 __sigqueue_free(q);
17778  void flush_sigqueue(struct sigpending *queue)
17780         struct sigqueue *q;
17781 @@ -414,6 +457,21 @@
17784  /*
17785 + * Called from __exit_signal. Flush tsk->pending and
17786 + * tsk->sigqueue_cache
17787 + */
17788 +void flush_task_sigqueue(struct task_struct *tsk)
17790 +       struct sigqueue *q;
17792 +       flush_sigqueue(&tsk->pending);
17794 +       q = get_task_cache(tsk);
17795 +       if (q)
17796 +               kmem_cache_free(sigqueue_cachep, q);
17800   * Flush all pending signals for this kthread.
17801   */
17802  void flush_signals(struct task_struct *t)
17803 @@ -525,7 +583,7 @@
17804  still_pending:
17805                 list_del_init(&first->list);
17806                 copy_siginfo(info, &first->info);
17807 -               __sigqueue_free(first);
17808 +               sigqueue_free_current(first);
17809         } else {
17810                 /*
17811                  * Ok, it wasn't in the queue.  This must be
17812 @@ -560,6 +618,8 @@
17814         int signr;
17816 +       WARN_ON_ONCE(tsk != current);
17818         /* We only dequeue private signals from ourselves, we don't let
17819          * signalfd steal them
17820          */
17821 @@ -1156,8 +1216,8 @@
17822   * We don't want to have recursive SIGSEGV's etc, for example,
17823   * that is why we also clear SIGNAL_UNKILLABLE.
17824   */
17825 -int
17826 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17827 +static int
17828 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17830         unsigned long int flags;
17831         int ret, blocked, ignored;
17832 @@ -1182,6 +1242,39 @@
17833         return ret;
17836 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17839 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
17840 + * since it can not enable preemption, and the signal code's spin_locks
17841 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
17842 + * send the signal on exit of the trap.
17843 + */
17844 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
17845 +       if (in_atomic()) {
17846 +               if (WARN_ON_ONCE(t != current))
17847 +                       return 0;
17848 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
17849 +                       return 0;
17851 +               if (is_si_special(info)) {
17852 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
17853 +                       t->forced_info.si_signo = sig;
17854 +                       t->forced_info.si_errno = 0;
17855 +                       t->forced_info.si_code = SI_KERNEL;
17856 +                       t->forced_info.si_pid = 0;
17857 +                       t->forced_info.si_uid = 0;
17858 +               } else {
17859 +                       t->forced_info = *info;
17860 +               }
17862 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
17863 +               return 0;
17864 +       }
17865 +#endif
17866 +       return do_force_sig_info(sig, info, t);
17869  /*
17870   * Nuke all other threads in the group.
17871   */
17872 @@ -1216,12 +1309,12 @@
17873                  * Disable interrupts early to avoid deadlocks.
17874                  * See rcu_read_unlock() comment header for details.
17875                  */
17876 -               local_irq_save(*flags);
17877 +               local_irq_save_nort(*flags);
17878                 rcu_read_lock();
17879                 sighand = rcu_dereference(tsk->sighand);
17880                 if (unlikely(sighand == NULL)) {
17881                         rcu_read_unlock();
17882 -                       local_irq_restore(*flags);
17883 +                       local_irq_restore_nort(*flags);
17884                         break;
17885                 }
17886                 /*
17887 @@ -1242,7 +1335,7 @@
17888                 }
17889                 spin_unlock(&sighand->siglock);
17890                 rcu_read_unlock();
17891 -               local_irq_restore(*flags);
17892 +               local_irq_restore_nort(*flags);
17893         }
17895         return sighand;
17896 @@ -1485,7 +1578,8 @@
17897   */
17898  struct sigqueue *sigqueue_alloc(void)
17900 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
17901 +       /* Preallocated sigqueue objects always come from the slab cache! */
17902 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
17904         if (q)
17905                 q->flags |= SIGQUEUE_PREALLOC;
17906 @@ -1846,15 +1940,7 @@
17907                 if (gstop_done && ptrace_reparented(current))
17908                         do_notify_parent_cldstop(current, false, why);
17910 -               /*
17911 -                * Don't want to allow preemption here, because
17912 -                * sys_ptrace() needs this task to be inactive.
17913 -                *
17914 -                * XXX: implement read_unlock_no_resched().
17915 -                */
17916 -               preempt_disable();
17917                 read_unlock(&tasklist_lock);
17918 -               preempt_enable_no_resched();
17919                 freezable_schedule();
17920         } else {
17921                 /*
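
The signal.c hunks above add a one-slot, per-task sigqueue cache: when an RT-priority task dequeues a signal, the sigqueue is parked in the cache with cmpxchg() instead of being returned to the slab, and __sigqueue_alloc() tries the cache before calling the allocator, shortening the RT signal delivery path. The same lock-free single-slot pattern, written out generically (illustrative sketch only; demo_* names are invented):

#include <linux/atomic.h>
#include <linux/compiler.h>

struct demo_obj;
static struct demo_obj *demo_slot;

/* Take the cached object; returns NULL if the slot is empty or a racer won. */
static struct demo_obj *demo_cache_get(void)
{
	struct demo_obj *obj = READ_ONCE(demo_slot);

	if (cmpxchg(&demo_slot, obj, NULL) != obj)
		return NULL;
	return obj;
}

/* Stash an object; returns 0 on success, 1 if the slot was already occupied. */
static int demo_cache_put(struct demo_obj *obj)
{
	if (cmpxchg(&demo_slot, NULL, obj) == NULL)
		return 0;
	return 1;
}
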
17922 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/softirq.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/softirq.c
17923 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/softirq.c        2017-04-16 10:38:30.000000000 +0200
17924 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/softirq.c     2017-04-18 17:54:26.000000000 +0200
17925 @@ -21,10 +21,12 @@
17926  #include <linux/freezer.h>
17927  #include <linux/kthread.h>
17928  #include <linux/rcupdate.h>
17929 +#include <linux/delay.h>
17930  #include <linux/ftrace.h>
17931  #include <linux/smp.h>
17932  #include <linux/smpboot.h>
17933  #include <linux/tick.h>
17934 +#include <linux/locallock.h>
17935  #include <linux/irq.h>
17937  #define CREATE_TRACE_POINTS
17938 @@ -56,12 +58,108 @@
17939  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
17941  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
17942 +#ifdef CONFIG_PREEMPT_RT_FULL
17943 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
17944 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
17945 +#endif
17947  const char * const softirq_to_name[NR_SOFTIRQS] = {
17948         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
17949         "TASKLET", "SCHED", "HRTIMER", "RCU"
17950  };
17952 +#ifdef CONFIG_NO_HZ_COMMON
17953 +# ifdef CONFIG_PREEMPT_RT_FULL
17955 +struct softirq_runner {
17956 +       struct task_struct *runner[NR_SOFTIRQS];
17959 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
17961 +static inline void softirq_set_runner(unsigned int sirq)
17963 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
17965 +       sr->runner[sirq] = current;
17968 +static inline void softirq_clr_runner(unsigned int sirq)
17970 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
17972 +       sr->runner[sirq] = NULL;
17976 + * On preempt-rt a softirq running context might be blocked on a
17977 + * lock. There might be no other runnable task on this CPU because the
17978 + * lock owner runs on some other CPU. So we have to go into idle with
17979 + * the pending bit set. Therefore we need to check for this, otherwise we
17980 + * warn about false positives, which confuses users and defeats the
17981 + * whole purpose of this test.
17982 + *
17983 + * This code is called with interrupts disabled.
17984 + */
17985 +void softirq_check_pending_idle(void)
17987 +       static int rate_limit;
17988 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
17989 +       u32 warnpending;
17990 +       int i;
17992 +       if (rate_limit >= 10)
17993 +               return;
17995 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
17996 +       for (i = 0; i < NR_SOFTIRQS; i++) {
17997 +               struct task_struct *tsk = sr->runner[i];
17999 +               /*
18000 +                * The wakeup code in rtmutex.c wakes up the task
18001 +                * _before_ it sets pi_blocked_on to NULL under
18002 +                * tsk->pi_lock. So we need to check for both: state
18003 +                * and pi_blocked_on.
18004 +                */
18005 +               if (tsk) {
18006 +                       raw_spin_lock(&tsk->pi_lock);
18007 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
18008 +                               /* Clear all bits pending in that task */
18009 +                               warnpending &= ~(tsk->softirqs_raised);
18010 +                               warnpending &= ~(1 << i);
18011 +                       }
18012 +                       raw_spin_unlock(&tsk->pi_lock);
18013 +               }
18014 +       }
18016 +       if (warnpending) {
18017 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18018 +                      warnpending);
18019 +               rate_limit++;
18020 +       }
18022 +# else
18024 + * On !PREEMPT_RT we just printk rate limited:
18025 + */
18026 +void softirq_check_pending_idle(void)
18028 +       static int rate_limit;
18030 +       if (rate_limit < 10 &&
18031 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18032 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18033 +                      local_softirq_pending());
18034 +               rate_limit++;
18035 +       }
18037 +# endif
18039 +#else /* !CONFIG_NO_HZ_COMMON */
18040 +static inline void softirq_set_runner(unsigned int sirq) { }
18041 +static inline void softirq_clr_runner(unsigned int sirq) { }
18042 +#endif
18044  /*
18045   * we cannot loop indefinitely here to avoid userspace starvation,
18046   * but we also don't want to introduce a worst case 1/HZ latency
18047 @@ -77,6 +175,38 @@
18048                 wake_up_process(tsk);
18051 +#ifdef CONFIG_PREEMPT_RT_FULL
18052 +static void wakeup_timer_softirqd(void)
18054 +       /* Interrupts are disabled: no need to stop preemption */
18055 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
18057 +       if (tsk && tsk->state != TASK_RUNNING)
18058 +               wake_up_process(tsk);
18060 +#endif
18062 +static void handle_softirq(unsigned int vec_nr)
18064 +       struct softirq_action *h = softirq_vec + vec_nr;
18065 +       int prev_count;
18067 +       prev_count = preempt_count();
18069 +       kstat_incr_softirqs_this_cpu(vec_nr);
18071 +       trace_softirq_entry(vec_nr);
18072 +       h->action(h);
18073 +       trace_softirq_exit(vec_nr);
18074 +       if (unlikely(prev_count != preempt_count())) {
18075 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18076 +                      vec_nr, softirq_to_name[vec_nr], h->action,
18077 +                      prev_count, preempt_count());
18078 +               preempt_count_set(prev_count);
18079 +       }
18082 +#ifndef CONFIG_PREEMPT_RT_FULL
18083  /*
18084   * If ksoftirqd is scheduled, we do not want to process pending softirqs
18085   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
18086 @@ -88,6 +218,47 @@
18087         return tsk && (tsk->state == TASK_RUNNING);
18090 +static inline int ksoftirqd_softirq_pending(void)
18092 +       return local_softirq_pending();
18095 +static void handle_pending_softirqs(u32 pending)
18097 +       struct softirq_action *h = softirq_vec;
18098 +       int softirq_bit;
18100 +       local_irq_enable();
18102 +       h = softirq_vec;
18104 +       while ((softirq_bit = ffs(pending))) {
18105 +               unsigned int vec_nr;
18107 +               h += softirq_bit - 1;
18108 +               vec_nr = h - softirq_vec;
18109 +               handle_softirq(vec_nr);
18111 +               h++;
18112 +               pending >>= softirq_bit;
18113 +       }
18115 +       rcu_bh_qs();
18116 +       local_irq_disable();
18119 +static void run_ksoftirqd(unsigned int cpu)
18121 +       local_irq_disable();
18122 +       if (ksoftirqd_softirq_pending()) {
18123 +               __do_softirq();
18124 +               local_irq_enable();
18125 +               cond_resched_rcu_qs();
18126 +               return;
18127 +       }
18128 +       local_irq_enable();
18131  /*
18132   * preempt_count and SOFTIRQ_OFFSET usage:
18133   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
18134 @@ -243,10 +414,8 @@
18135         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
18136         unsigned long old_flags = current->flags;
18137         int max_restart = MAX_SOFTIRQ_RESTART;
18138 -       struct softirq_action *h;
18139         bool in_hardirq;
18140         __u32 pending;
18141 -       int softirq_bit;
18143         /*
18144          * Mask out PF_MEMALLOC s current task context is borrowed for the
18145 @@ -265,36 +434,7 @@
18146         /* Reset the pending bitmask before enabling irqs */
18147         set_softirq_pending(0);
18149 -       local_irq_enable();
18151 -       h = softirq_vec;
18153 -       while ((softirq_bit = ffs(pending))) {
18154 -               unsigned int vec_nr;
18155 -               int prev_count;
18157 -               h += softirq_bit - 1;
18159 -               vec_nr = h - softirq_vec;
18160 -               prev_count = preempt_count();
18162 -               kstat_incr_softirqs_this_cpu(vec_nr);
18164 -               trace_softirq_entry(vec_nr);
18165 -               h->action(h);
18166 -               trace_softirq_exit(vec_nr);
18167 -               if (unlikely(prev_count != preempt_count())) {
18168 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18169 -                              vec_nr, softirq_to_name[vec_nr], h->action,
18170 -                              prev_count, preempt_count());
18171 -                       preempt_count_set(prev_count);
18172 -               }
18173 -               h++;
18174 -               pending >>= softirq_bit;
18175 -       }
18177 -       rcu_bh_qs();
18178 -       local_irq_disable();
18179 +       handle_pending_softirqs(pending);
18181         pending = local_softirq_pending();
18182         if (pending) {
18183 @@ -331,6 +471,309 @@
18186  /*
18187 + * This function must run with irqs disabled!
18188 + */
18189 +void raise_softirq_irqoff(unsigned int nr)
18191 +       __raise_softirq_irqoff(nr);
18193 +       /*
18194 +        * If we're in an interrupt or softirq, we're done
18195 +        * (this also catches softirq-disabled code). We will
18196 +        * actually run the softirq once we return from
18197 +        * the irq or softirq.
18198 +        *
18199 +        * Otherwise we wake up ksoftirqd to make sure we
18200 +        * schedule the softirq soon.
18201 +        */
18202 +       if (!in_interrupt())
18203 +               wakeup_softirqd();
18206 +void __raise_softirq_irqoff(unsigned int nr)
18208 +       trace_softirq_raise(nr);
18209 +       or_softirq_pending(1UL << nr);
18212 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
18213 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
18214 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
18216 +#else /* !PREEMPT_RT_FULL */
18219 + * On RT we serialize softirq execution with a cpu local lock per softirq
18220 + */
18221 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
18223 +void __init softirq_early_init(void)
18225 +       int i;
18227 +       for (i = 0; i < NR_SOFTIRQS; i++)
18228 +               local_irq_lock_init(local_softirq_locks[i]);
18231 +static void lock_softirq(int which)
18233 +       local_lock(local_softirq_locks[which]);
18236 +static void unlock_softirq(int which)
18238 +       local_unlock(local_softirq_locks[which]);
18241 +static void do_single_softirq(int which)
18243 +       unsigned long old_flags = current->flags;
18245 +       current->flags &= ~PF_MEMALLOC;
18246 +       vtime_account_irq_enter(current);
18247 +       current->flags |= PF_IN_SOFTIRQ;
18248 +       lockdep_softirq_enter();
18249 +       local_irq_enable();
18250 +       handle_softirq(which);
18251 +       local_irq_disable();
18252 +       lockdep_softirq_exit();
18253 +       current->flags &= ~PF_IN_SOFTIRQ;
18254 +       vtime_account_irq_enter(current);
18255 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
18259 + * Called with interrupts disabled. Process softirqs which were raised
18260 + * in current context (or on behalf of ksoftirqd).
18261 + */
18262 +static void do_current_softirqs(void)
18264 +       while (current->softirqs_raised) {
18265 +               int i = __ffs(current->softirqs_raised);
18266 +               unsigned int pending, mask = (1U << i);
18268 +               current->softirqs_raised &= ~mask;
18269 +               local_irq_enable();
18271 +               /*
18272 +                * If the lock is contended, we boost the owner to
18273 +                * process the softirq or leave the critical section
18274 +                * now.
18275 +                */
18276 +               lock_softirq(i);
18277 +               local_irq_disable();
18278 +               softirq_set_runner(i);
18279 +               /*
18280 +                * Check the local_softirq_pending() bits to see
18281 +                * whether we still need to process this or if someone
18282 +                * else took care of it.
18283 +                */
18284 +               pending = local_softirq_pending();
18285 +               if (pending & mask) {
18286 +                       set_softirq_pending(pending & ~mask);
18287 +                       do_single_softirq(i);
18288 +               }
18289 +               softirq_clr_runner(i);
18290 +               WARN_ON(current->softirq_nestcnt != 1);
18291 +               local_irq_enable();
18292 +               unlock_softirq(i);
18293 +               local_irq_disable();
18294 +       }
18297 +void __local_bh_disable(void)
18299 +       if (++current->softirq_nestcnt == 1)
18300 +               migrate_disable();
18302 +EXPORT_SYMBOL(__local_bh_disable);
18304 +void __local_bh_enable(void)
18306 +       if (WARN_ON(current->softirq_nestcnt == 0))
18307 +               return;
18309 +       local_irq_disable();
18310 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
18311 +               do_current_softirqs();
18312 +       local_irq_enable();
18314 +       if (--current->softirq_nestcnt == 0)
18315 +               migrate_enable();
18317 +EXPORT_SYMBOL(__local_bh_enable);
18319 +void _local_bh_enable(void)
18321 +       if (WARN_ON(current->softirq_nestcnt == 0))
18322 +               return;
18323 +       if (--current->softirq_nestcnt == 0)
18324 +               migrate_enable();
18326 +EXPORT_SYMBOL(_local_bh_enable);
18328 +int in_serving_softirq(void)
18330 +       return current->flags & PF_IN_SOFTIRQ;
18332 +EXPORT_SYMBOL(in_serving_softirq);
18334 +/* Called with preemption disabled */
18335 +static void run_ksoftirqd(unsigned int cpu)
18337 +       local_irq_disable();
18338 +       current->softirq_nestcnt++;
18340 +       do_current_softirqs();
18341 +       current->softirq_nestcnt--;
18342 +       local_irq_enable();
18343 +       cond_resched_rcu_qs();
18347 + * Called from netif_rx_ni(). Preemption enabled, but migration
18348 + * disabled. So the cpu can't go away under us.
18349 + */
18350 +void thread_do_softirq(void)
18352 +       if (!in_serving_softirq() && current->softirqs_raised) {
18353 +               current->softirq_nestcnt++;
18354 +               do_current_softirqs();
18355 +               current->softirq_nestcnt--;
18356 +       }
18359 +static void do_raise_softirq_irqoff(unsigned int nr)
18361 +       unsigned int mask;
18363 +       mask = 1UL << nr;
18365 +       trace_softirq_raise(nr);
18366 +       or_softirq_pending(mask);
18368 +       /*
18369 +        * If we are not in a hard interrupt and inside a bh disabled
18370 +        * region, we simply raise the flag on current. local_bh_enable()
18371 +        * will make sure that the softirq is executed. Otherwise we
18372 +        * delegate it to ksoftirqd.
18373 +        */
18374 +       if (!in_irq() && current->softirq_nestcnt)
18375 +               current->softirqs_raised |= mask;
18376 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
18377 +               return;
18379 +       if (mask & TIMER_SOFTIRQS)
18380 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18381 +       else
18382 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18385 +static void wakeup_proper_softirq(unsigned int nr)
18387 +       if ((1UL << nr) & TIMER_SOFTIRQS)
18388 +               wakeup_timer_softirqd();
18389 +       else
18390 +               wakeup_softirqd();
18393 +void __raise_softirq_irqoff(unsigned int nr)
18395 +       do_raise_softirq_irqoff(nr);
18396 +       if (!in_irq() && !current->softirq_nestcnt)
18397 +               wakeup_proper_softirq(nr);
18401 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
18402 + */
18403 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
18405 +       unsigned int mask;
18407 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
18408 +                        !__this_cpu_read(ktimer_softirqd)))
18409 +               return;
18410 +       mask = 1UL << nr;
18412 +       trace_softirq_raise(nr);
18413 +       or_softirq_pending(mask);
18414 +       if (mask & TIMER_SOFTIRQS)
18415 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18416 +       else
18417 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18418 +       wakeup_proper_softirq(nr);
18422 + * This function must run with irqs disabled!
18423 + */
18424 +void raise_softirq_irqoff(unsigned int nr)
18426 +       do_raise_softirq_irqoff(nr);
18428 +       /*
18429 +        * If we're in a hard interrupt we let the irq return code deal
18430 +        * with the wakeup of ksoftirqd.
18431 +        */
18432 +       if (in_irq())
18433 +               return;
18434 +       /*
18435 +        * If we are in thread context but outside of a bh disabled
18436 +        * region, we need to wake ksoftirqd as well.
18437 +        *
18438 +        * CHECKME: Some of the places which do that could be wrapped
18439 +        * into local_bh_disable/enable pairs. Though it's unclear
18440 +        * whether this is worth the effort. To find those places just
18441 +        * raise a WARN() if the condition is met.
18442 +        */
18443 +       if (!current->softirq_nestcnt)
18444 +               wakeup_proper_softirq(nr);
18447 +static inline int ksoftirqd_softirq_pending(void)
18449 +       return current->softirqs_raised;
18452 +static inline void local_bh_disable_nort(void) { }
18453 +static inline void _local_bh_enable_nort(void) { }
18455 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
18457 +       /* Take over all but timer pending softirqs when starting */
18458 +       local_irq_disable();
18459 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
18460 +       local_irq_enable();
18463 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
18465 +       struct sched_param param = { .sched_priority = 1 };
18467 +       sched_setscheduler(current, SCHED_FIFO, &param);
18469 +       /* Take over timer pending softirqs when starting */
18470 +       local_irq_disable();
18471 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
18472 +       local_irq_enable();
18475 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
18476 +                                                   bool online)
18478 +       struct sched_param param = { .sched_priority = 0 };
18480 +       sched_setscheduler(current, SCHED_NORMAL, &param);
18483 +static int ktimer_softirqd_should_run(unsigned int cpu)
18485 +       return current->softirqs_raised;
18488 +#endif /* PREEMPT_RT_FULL */
18490   * Enter an interrupt context.
18491   */
18492  void irq_enter(void)
18493 @@ -341,9 +784,9 @@
18494                  * Prevent raise_softirq from needlessly waking up ksoftirqd
18495                  * here, as softirq will be serviced on return from interrupt.
18496                  */
18497 -               local_bh_disable();
18498 +               local_bh_disable_nort();
18499                 tick_irq_enter();
18500 -               _local_bh_enable();
18501 +               _local_bh_enable_nort();
18502         }
18504         __irq_enter();
18505 @@ -351,6 +794,7 @@
18507  static inline void invoke_softirq(void)
18509 +#ifndef CONFIG_PREEMPT_RT_FULL
18510         if (ksoftirqd_running())
18511                 return;
18513 @@ -373,6 +817,18 @@
18514         } else {
18515                 wakeup_softirqd();
18516         }
18517 +#else /* PREEMPT_RT_FULL */
18518 +       unsigned long flags;
18520 +       local_irq_save(flags);
18521 +       if (__this_cpu_read(ksoftirqd) &&
18522 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
18523 +               wakeup_softirqd();
18524 +       if (__this_cpu_read(ktimer_softirqd) &&
18525 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
18526 +               wakeup_timer_softirqd();
18527 +       local_irq_restore(flags);
18528 +#endif
18531  static inline void tick_irq_exit(void)
18532 @@ -409,26 +865,6 @@
18533         trace_hardirq_exit(); /* must be last! */
18537 - * This function must run with irqs disabled!
18538 - */
18539 -inline void raise_softirq_irqoff(unsigned int nr)
18541 -       __raise_softirq_irqoff(nr);
18543 -       /*
18544 -        * If we're in an interrupt or softirq, we're done
18545 -        * (this also catches softirq-disabled code). We will
18546 -        * actually run the softirq once we return from
18547 -        * the irq or softirq.
18548 -        *
18549 -        * Otherwise we wake up ksoftirqd to make sure we
18550 -        * schedule the softirq soon.
18551 -        */
18552 -       if (!in_interrupt())
18553 -               wakeup_softirqd();
18556  void raise_softirq(unsigned int nr)
18558         unsigned long flags;
18559 @@ -438,12 +874,6 @@
18560         local_irq_restore(flags);
18563 -void __raise_softirq_irqoff(unsigned int nr)
18565 -       trace_softirq_raise(nr);
18566 -       or_softirq_pending(1UL << nr);
18569  void open_softirq(int nr, void (*action)(struct softirq_action *))
18571         softirq_vec[nr].action = action;
18572 @@ -460,15 +890,45 @@
18573  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
18574  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
18576 +static inline void
18577 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
18579 +       if (tasklet_trylock(t)) {
18580 +again:
18581 +               /* We may have been preempted before tasklet_trylock
18582 +                * and __tasklet_action may have already run.
18583 +                * So double check the sched bit while the tasklet
18584 +                * is locked before adding it to the list.
18585 +                */
18586 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
18587 +                       t->next = NULL;
18588 +                       *head->tail = t;
18589 +                       head->tail = &(t->next);
18590 +                       raise_softirq_irqoff(nr);
18591 +                       tasklet_unlock(t);
18592 +               } else {
18593 +                       /* This is subtle. If we hit the corner case above,
18594 +                        * it is possible that we get preempted right here,
18595 +                        * and another task has successfully called
18596 +                        * tasklet_schedule(), then this function, and
18597 +                        * failed on the trylock. Thus we must be sure,
18598 +                        * before releasing the tasklet lock, that the
18599 +                        * SCHED bit is clear. Otherwise the tasklet
18600 +                        * may get its SCHED bit set, but not added to
18601 +                        * the list.
18602 +                        */
18603 +                       if (!tasklet_tryunlock(t))
18604 +                               goto again;
18605 +               }
18606 +       }
18609  void __tasklet_schedule(struct tasklet_struct *t)
18611         unsigned long flags;
18613         local_irq_save(flags);
18614 -       t->next = NULL;
18615 -       *__this_cpu_read(tasklet_vec.tail) = t;
18616 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
18617 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
18618 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
18619         local_irq_restore(flags);
18621  EXPORT_SYMBOL(__tasklet_schedule);
18622 @@ -478,10 +938,7 @@
18623         unsigned long flags;
18625         local_irq_save(flags);
18626 -       t->next = NULL;
18627 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
18628 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
18629 -       raise_softirq_irqoff(HI_SOFTIRQ);
18630 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
18631         local_irq_restore(flags);
18633  EXPORT_SYMBOL(__tasklet_hi_schedule);
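The trylock/tryunlock protocol used above (and in the reworked __tasklet_action in the next hunk) hinges on the RUN bit acting as a lock that may only be dropped via the RUN -> 0 transition. Below is a minimal userspace model of that state machine using C11 atomics, purely for illustration: the kernel helpers are test_and_set_bit()/cmpxchg() on tasklet->state, and the names here are made up.

/* Minimal model of the tasklet RUN-bit locking; illustrative only. */
#include <stdatomic.h>
#include <stdbool.h>

#define ST_SCHED (1UL << 0)   /* tasklet queued for execution */
#define ST_RUN   (1UL << 1)   /* tasklet function running     */

struct tl { _Atomic unsigned long state; };

static bool tl_trylock(struct tl *t)
{
        /* Succeeds only if RUN was previously clear. */
        return !(atomic_fetch_or(&t->state, ST_RUN) & ST_RUN);
}

static bool tl_tryunlock(struct tl *t)
{
        /* Only the RUN -> 0 transition drops the lock; if SCHED was set
         * meanwhile, this fails and the caller must requeue or re-run
         * the tasklet before unlocking. */
        unsigned long run = ST_RUN;
        return atomic_compare_exchange_strong(&t->state, &run, 0);
}

If SCHED is set between running the function and the unlock attempt, tl_tryunlock() fails, which is exactly the window the comments above guard against.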
18634 @@ -490,82 +947,122 @@
18636         BUG_ON(!irqs_disabled());
18638 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
18639 -       __this_cpu_write(tasklet_hi_vec.head, t);
18640 -       __raise_softirq_irqoff(HI_SOFTIRQ);
18641 +       __tasklet_hi_schedule(t);
18643  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
18645 -static __latent_entropy void tasklet_action(struct softirq_action *a)
18646 +void tasklet_enable(struct tasklet_struct *t)
18648 -       struct tasklet_struct *list;
18649 +       if (!atomic_dec_and_test(&t->count))
18650 +               return;
18651 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
18652 +               tasklet_schedule(t);
18654 +EXPORT_SYMBOL(tasklet_enable);
18656 -       local_irq_disable();
18657 -       list = __this_cpu_read(tasklet_vec.head);
18658 -       __this_cpu_write(tasklet_vec.head, NULL);
18659 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
18660 -       local_irq_enable();
18661 +static void __tasklet_action(struct softirq_action *a,
18662 +                            struct tasklet_struct *list)
18664 +       int loops = 1000000;
18666         while (list) {
18667                 struct tasklet_struct *t = list;
18669                 list = list->next;
18671 -               if (tasklet_trylock(t)) {
18672 -                       if (!atomic_read(&t->count)) {
18673 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
18674 -                                                       &t->state))
18675 -                                       BUG();
18676 -                               t->func(t->data);
18677 -                               tasklet_unlock(t);
18678 -                               continue;
18679 -                       }
18680 -                       tasklet_unlock(t);
18681 +               /*
18682 +                * Should always succeed - after a tasklist got on the
18683 +                * list (after getting the SCHED bit set from 0 to 1),
18684 +                * nothing but the tasklet softirq it got queued to can
18685 +                * lock it:
18686 +                */
18687 +               if (!tasklet_trylock(t)) {
18688 +                       WARN_ON(1);
18689 +                       continue;
18690                 }
18692 -               local_irq_disable();
18693                 t->next = NULL;
18694 -               *__this_cpu_read(tasklet_vec.tail) = t;
18695 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
18696 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
18697 -               local_irq_enable();
18699 +               /*
18700 +                * If we cannot handle the tasklet because it's disabled,
18701 +                * mark it as pending. tasklet_enable() will later
18702 +                * re-schedule the tasklet.
18703 +                */
18704 +               if (unlikely(atomic_read(&t->count))) {
18705 +out_disabled:
18706 +                       /* implicit unlock: */
18707 +                       wmb();
18708 +                       t->state = TASKLET_STATEF_PENDING;
18709 +                       continue;
18710 +               }
18712 +               /*
18713 +                * From this point on the tasklet might be rescheduled
18714 +                * on another CPU, but it can only be added to another
18715 +                * CPU's tasklet list if we unlock the tasklet (which we
18716 +                * don't do yet).
18717 +                */
18718 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
18719 +                       WARN_ON(1);
18721 +again:
18722 +               t->func(t->data);
18724 +               /*
18725 +                * Try to unlock the tasklet. We must use cmpxchg, because
18726 +                * another CPU might have scheduled or disabled the tasklet.
18727 +                * We only allow the STATE_RUN -> 0 transition here.
18728 +                */
18729 +               while (!tasklet_tryunlock(t)) {
18730 +                       /*
18731 +                        * If it got disabled meanwhile, bail out:
18732 +                        */
18733 +                       if (atomic_read(&t->count))
18734 +                               goto out_disabled;
18735 +                       /*
18736 +                        * If it got scheduled meanwhile, re-execute
18737 +                        * the tasklet function:
18738 +                        */
18739 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
18740 +                               goto again;
18741 +                       if (!--loops) {
18742 +                               printk("hm, tasklet state: %08lx\n", t->state);
18743 +                               WARN_ON(1);
18744 +                               tasklet_unlock(t);
18745 +                               break;
18746 +                       }
18747 +               }
18748         }
18751 +static void tasklet_action(struct softirq_action *a)
18753 +       struct tasklet_struct *list;
18755 +       local_irq_disable();
18757 +       list = __this_cpu_read(tasklet_vec.head);
18758 +       __this_cpu_write(tasklet_vec.head, NULL);
18759 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
18761 +       local_irq_enable();
18763 +       __tasklet_action(a, list);
18766  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
18768         struct tasklet_struct *list;
18770         local_irq_disable();
18772         list = __this_cpu_read(tasklet_hi_vec.head);
18773         __this_cpu_write(tasklet_hi_vec.head, NULL);
18774         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
18775 -       local_irq_enable();
18777 -       while (list) {
18778 -               struct tasklet_struct *t = list;
18780 -               list = list->next;
18782 -               if (tasklet_trylock(t)) {
18783 -                       if (!atomic_read(&t->count)) {
18784 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
18785 -                                                       &t->state))
18786 -                                       BUG();
18787 -                               t->func(t->data);
18788 -                               tasklet_unlock(t);
18789 -                               continue;
18790 -                       }
18791 -                       tasklet_unlock(t);
18792 -               }
18793 +       local_irq_enable();
18795 -               local_irq_disable();
18796 -               t->next = NULL;
18797 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
18798 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
18799 -               __raise_softirq_irqoff(HI_SOFTIRQ);
18800 -               local_irq_enable();
18801 -       }
18802 +       __tasklet_action(a, list);
18805  void tasklet_init(struct tasklet_struct *t,
18806 @@ -586,7 +1083,7 @@
18808         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
18809                 do {
18810 -                       yield();
18811 +                       msleep(1);
18812                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
18813         }
18814         tasklet_unlock_wait(t);
18815 @@ -660,25 +1157,26 @@
18816         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
18819 -static int ksoftirqd_should_run(unsigned int cpu)
18820 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
18821 +void tasklet_unlock_wait(struct tasklet_struct *t)
18823 -       return local_softirq_pending();
18826 -static void run_ksoftirqd(unsigned int cpu)
18828 -       local_irq_disable();
18829 -       if (local_softirq_pending()) {
18830 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
18831                 /*
18832 -                * We can safely run softirq on inline stack, as we are not deep
18833 -                * in the task stack here.
18834 +                * Hack for now to avoid this busy-loop:
18835                  */
18836 -               __do_softirq();
18837 -               local_irq_enable();
18838 -               cond_resched_rcu_qs();
18839 -               return;
18840 +#ifdef CONFIG_PREEMPT_RT_FULL
18841 +               msleep(1);
18842 +#else
18843 +               barrier();
18844 +#endif
18845         }
18846 -       local_irq_enable();
18848 +EXPORT_SYMBOL(tasklet_unlock_wait);
18849 +#endif
18851 +static int ksoftirqd_should_run(unsigned int cpu)
18853 +       return ksoftirqd_softirq_pending();
18856  #ifdef CONFIG_HOTPLUG_CPU
18857 @@ -745,17 +1243,31 @@
18859  static struct smp_hotplug_thread softirq_threads = {
18860         .store                  = &ksoftirqd,
18861 +       .setup                  = ksoftirqd_set_sched_params,
18862         .thread_should_run      = ksoftirqd_should_run,
18863         .thread_fn              = run_ksoftirqd,
18864         .thread_comm            = "ksoftirqd/%u",
18865  };
18867 +#ifdef CONFIG_PREEMPT_RT_FULL
18868 +static struct smp_hotplug_thread softirq_timer_threads = {
18869 +       .store                  = &ktimer_softirqd,
18870 +       .setup                  = ktimer_softirqd_set_sched_params,
18871 +       .cleanup                = ktimer_softirqd_clr_sched_params,
18872 +       .thread_should_run      = ktimer_softirqd_should_run,
18873 +       .thread_fn              = run_ksoftirqd,
18874 +       .thread_comm            = "ktimersoftd/%u",
18876 +#endif
18878  static __init int spawn_ksoftirqd(void)
18880         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
18881                                   takeover_tasklets);
18882         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
18884 +#ifdef CONFIG_PREEMPT_RT_FULL
18885 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
18886 +#endif
18887         return 0;
18889  early_initcall(spawn_ksoftirqd);
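handle_pending_softirqs() above walks the pending bitmask with ffs(), advancing a cumulative offset so each set bit maps back to its absolute vector number. A standalone sketch of just that dispatch loop follows; the vector names and the printf body are illustrative, not the kernel's.

/* Standalone model of the ffs-based softirq dispatch loop. */
#include <stdio.h>
#include <strings.h>        /* ffs() */

static const char *vec_name[] = { "HI", "TIMER", "NET_TX", "NET_RX" };

static void dispatch(unsigned int pending)
{
        unsigned int base = 0;
        int softirq_bit;

        while ((softirq_bit = ffs(pending))) {
                unsigned int vec_nr = base + softirq_bit - 1;

                printf("running softirq %u (%s)\n", vec_nr, vec_name[vec_nr]);

                base += softirq_bit;          /* mirrors h += softirq_bit - 1; ...; h++ */
                pending >>= softirq_bit;
        }
}

int main(void)
{
        dispatch((1U << 0) | (1U << 3));      /* HI and NET_RX pending */
        return 0;
}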
18890 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/stop_machine.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/stop_machine.c
18891 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/stop_machine.c   2017-04-16 10:38:30.000000000 +0200
18892 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/stop_machine.c        2017-04-18 17:54:26.000000000 +0200
18893 @@ -36,7 +36,7 @@
18894  struct cpu_stopper {
18895         struct task_struct      *thread;
18897 -       spinlock_t              lock;
18898 +       raw_spinlock_t          lock;
18899         bool                    enabled;        /* is this stopper enabled? */
18900         struct list_head        works;          /* list of pending works */
18902 @@ -78,14 +78,14 @@
18903         unsigned long flags;
18904         bool enabled;
18906 -       spin_lock_irqsave(&stopper->lock, flags);
18907 +       raw_spin_lock_irqsave(&stopper->lock, flags);
18908         enabled = stopper->enabled;
18909         if (enabled)
18910                 __cpu_stop_queue_work(stopper, work);
18911         else if (work->done)
18912                 cpu_stop_signal_done(work->done);
18913 -       spin_unlock_irqrestore(&stopper->lock, flags);
18915 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
18916         return enabled;
18919 @@ -231,8 +231,8 @@
18920         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
18921         int err;
18922  retry:
18923 -       spin_lock_irq(&stopper1->lock);
18924 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
18925 +       raw_spin_lock_irq(&stopper1->lock);
18926 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
18928         err = -ENOENT;
18929         if (!stopper1->enabled || !stopper2->enabled)
18930 @@ -255,8 +255,8 @@
18931         __cpu_stop_queue_work(stopper1, work1);
18932         __cpu_stop_queue_work(stopper2, work2);
18933  unlock:
18934 -       spin_unlock(&stopper2->lock);
18935 -       spin_unlock_irq(&stopper1->lock);
18936 +       raw_spin_unlock(&stopper2->lock);
18937 +       raw_spin_unlock_irq(&stopper1->lock);
18939         if (unlikely(err == -EDEADLK)) {
18940                 while (stop_cpus_in_progress)
18941 @@ -448,9 +448,9 @@
18942         unsigned long flags;
18943         int run;
18945 -       spin_lock_irqsave(&stopper->lock, flags);
18946 +       raw_spin_lock_irqsave(&stopper->lock, flags);
18947         run = !list_empty(&stopper->works);
18948 -       spin_unlock_irqrestore(&stopper->lock, flags);
18949 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
18950         return run;
18953 @@ -461,13 +461,13 @@
18955  repeat:
18956         work = NULL;
18957 -       spin_lock_irq(&stopper->lock);
18958 +       raw_spin_lock_irq(&stopper->lock);
18959         if (!list_empty(&stopper->works)) {
18960                 work = list_first_entry(&stopper->works,
18961                                         struct cpu_stop_work, list);
18962                 list_del_init(&work->list);
18963         }
18964 -       spin_unlock_irq(&stopper->lock);
18965 +       raw_spin_unlock_irq(&stopper->lock);
18967         if (work) {
18968                 cpu_stop_fn_t fn = work->fn;
18969 @@ -475,6 +475,8 @@
18970                 struct cpu_stop_done *done = work->done;
18971                 int ret;
18973 +               /* XXX */
18975                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
18976                 preempt_count_inc();
18977                 ret = fn(arg);
18978 @@ -541,7 +543,7 @@
18979         for_each_possible_cpu(cpu) {
18980                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
18982 -               spin_lock_init(&stopper->lock);
18983 +               raw_spin_lock_init(&stopper->lock);
18984                 INIT_LIST_HEAD(&stopper->works);
18985         }
18987 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/hrtimer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/hrtimer.c
18988 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/hrtimer.c   2017-04-16 10:38:30.000000000 +0200
18989 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/hrtimer.c        2017-04-18 17:54:26.000000000 +0200
18990 @@ -53,6 +53,7 @@
18991  #include <asm/uaccess.h>
18993  #include <trace/events/timer.h>
18994 +#include <trace/events/hist.h>
18996  #include "tick-internal.h"
18998 @@ -695,6 +696,29 @@
18999         retrigger_next_event(NULL);
19002 +#ifdef CONFIG_PREEMPT_RT_FULL
19004 +static struct swork_event clock_set_delay_work;
19006 +static void run_clock_set_delay(struct swork_event *event)
19008 +       clock_was_set();
19011 +void clock_was_set_delayed(void)
19013 +       swork_queue(&clock_set_delay_work);
19016 +static __init int create_clock_set_delay_thread(void)
19018 +       WARN_ON(swork_get());
19019 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
19020 +       return 0;
19022 +early_initcall(create_clock_set_delay_thread);
19023 +#else /* PREEMPT_RT_FULL */
19025  static void clock_was_set_work(struct work_struct *work)
19027         clock_was_set();
19028 @@ -710,6 +734,7 @@
19030         schedule_work(&hrtimer_work);
19032 +#endif
19034  #else
19036 @@ -719,11 +744,8 @@
19037  static inline void hrtimer_switch_to_hres(void) { }
19038  static inline void
19039  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
19040 -static inline int hrtimer_reprogram(struct hrtimer *timer,
19041 -                                   struct hrtimer_clock_base *base)
19043 -       return 0;
19045 +static inline void hrtimer_reprogram(struct hrtimer *timer,
19046 +                                    struct hrtimer_clock_base *base) { }
19047  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
19048  static inline void retrigger_next_event(void *arg) { }
19050 @@ -855,6 +877,32 @@
19052  EXPORT_SYMBOL_GPL(hrtimer_forward);
19054 +#ifdef CONFIG_PREEMPT_RT_BASE
19055 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
19057 +/**
19058 + * hrtimer_wait_for_timer - Wait for a running timer
19059 + *
19060 + * @timer:     timer to wait for
19061 + *
19062 + * If the timer's callback function is currently being executed,
19063 + * the function waits on the waitqueue of the timer base. The
19064 + * waitqueue is woken up after the timer callback function has
19065 + * finished execution.
19066 + */
19067 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
19069 +       struct hrtimer_clock_base *base = timer->base;
19071 +       if (base && base->cpu_base && !timer->irqsafe)
19072 +               wait_event(base->cpu_base->wait,
19073 +                               !(hrtimer_callback_running(timer)));
19076 +#else
19077 +# define wake_up_timer_waiters(b)      do { } while (0)
19078 +#endif
19080  /*
19081   * enqueue_hrtimer - internal function to (re)start a timer
19082   *
19083 @@ -896,6 +944,11 @@
19084         if (!(state & HRTIMER_STATE_ENQUEUED))
19085                 return;
19087 +       if (unlikely(!list_empty(&timer->cb_entry))) {
19088 +               list_del_init(&timer->cb_entry);
19089 +               return;
19090 +       }
19092         if (!timerqueue_del(&base->active, &timer->node))
19093                 cpu_base->active_bases &= ~(1 << base->index);
19095 @@ -991,7 +1044,16 @@
19096         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
19098         timer_stats_hrtimer_set_start_info(timer);
19099 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19100 +       {
19101 +               ktime_t now = new_base->get_time();
19103 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
19104 +                       timer->praecox = now;
19105 +               else
19106 +                       timer->praecox = ktime_set(0, 0);
19107 +       }
19108 +#endif
19109         leftmost = enqueue_hrtimer(timer, new_base);
19110         if (!leftmost)
19111                 goto unlock;
19112 @@ -1063,7 +1125,7 @@
19114                 if (ret >= 0)
19115                         return ret;
19116 -               cpu_relax();
19117 +               hrtimer_wait_for_timer(timer);
19118         }
19120  EXPORT_SYMBOL_GPL(hrtimer_cancel);
19121 @@ -1127,6 +1189,7 @@
19123         base = hrtimer_clockid_to_base(clock_id);
19124         timer->base = &cpu_base->clock_base[base];
19125 +       INIT_LIST_HEAD(&timer->cb_entry);
19126         timerqueue_init(&timer->node);
19128  #ifdef CONFIG_TIMER_STATS
19129 @@ -1167,6 +1230,7 @@
19130                 seq = raw_read_seqcount_begin(&cpu_base->seq);
19132                 if (timer->state != HRTIMER_STATE_INACTIVE ||
19133 +                   cpu_base->running_soft == timer ||
19134                     cpu_base->running == timer)
19135                         return true;
19137 @@ -1265,10 +1329,112 @@
19138         cpu_base->running = NULL;
19141 +#ifdef CONFIG_PREEMPT_RT_BASE
19142 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
19143 +                                struct hrtimer_clock_base *base)
19145 +       int leftmost;
19147 +       if (restart != HRTIMER_NORESTART &&
19148 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
19150 +               leftmost = enqueue_hrtimer(timer, base);
19151 +               if (!leftmost)
19152 +                       return;
19153 +#ifdef CONFIG_HIGH_RES_TIMERS
19154 +               if (!hrtimer_is_hres_active(timer)) {
19155 +                       /*
19156 +                        * Kick to reschedule the next tick to handle the new timer
19157 +                        * on dynticks target.
19158 +                        */
19159 +                       if (base->cpu_base->nohz_active)
19160 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
19161 +               } else {
19163 +                       hrtimer_reprogram(timer, base);
19164 +               }
19165 +#endif
19166 +       }
19170 + * The changes in mainline which removed the callback modes from
19171 + * hrtimer are not yet working with -rt. The non wakeup_process()
19172 + * based callbacks which involve sleeping locks need to be treated
19173 + * separately.
19174 + */
19175 +static void hrtimer_rt_run_pending(void)
19177 +       enum hrtimer_restart (*fn)(struct hrtimer *);
19178 +       struct hrtimer_cpu_base *cpu_base;
19179 +       struct hrtimer_clock_base *base;
19180 +       struct hrtimer *timer;
19181 +       int index, restart;
19183 +       local_irq_disable();
19184 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
19186 +       raw_spin_lock(&cpu_base->lock);
19188 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
19189 +               base = &cpu_base->clock_base[index];
19191 +               while (!list_empty(&base->expired)) {
19192 +                       timer = list_first_entry(&base->expired,
19193 +                                                struct hrtimer, cb_entry);
19195 +                       /*
19196 +                        * Same as the above __run_hrtimer function
19197 +                        * except that we run with interrupts enabled.
19198 +                        */
19199 +                       debug_deactivate(timer);
19200 +                       cpu_base->running_soft = timer;
19201 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19203 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
19204 +                       timer_stats_account_hrtimer(timer);
19205 +                       fn = timer->function;
19207 +                       raw_spin_unlock_irq(&cpu_base->lock);
19208 +                       restart = fn(timer);
19209 +                       raw_spin_lock_irq(&cpu_base->lock);
19211 +                       hrtimer_rt_reprogram(restart, timer, base);
19212 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19214 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
19215 +                       cpu_base->running_soft = NULL;
19216 +               }
19217 +       }
19219 +       raw_spin_unlock_irq(&cpu_base->lock);
19221 +       wake_up_timer_waiters(cpu_base);
19224 +static int hrtimer_rt_defer(struct hrtimer *timer)
19226 +       if (timer->irqsafe)
19227 +               return 0;
19229 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
19230 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
19231 +       return 1;
19234 +#else
19236 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
19238 +#endif
19240 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
19242  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19244         struct hrtimer_clock_base *base = cpu_base->clock_base;
19245         unsigned int active = cpu_base->active_bases;
19246 +       int raise = 0;
19248         for (; active; base++, active >>= 1) {
19249                 struct timerqueue_node *node;
19250 @@ -1284,6 +1450,15 @@
19252                         timer = container_of(node, struct hrtimer, node);
19254 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
19255 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
19256 +                               timer->praecox : hrtimer_get_expires(timer),
19257 +                               basenow)),
19258 +                           current,
19259 +                           timer->function == hrtimer_wakeup ?
19260 +                           container_of(timer, struct hrtimer_sleeper,
19261 +                               timer)->task : NULL);
19263                         /*
19264                          * The immediate goal for using the softexpires is
19265                          * minimizing wakeups, not running timers at the
19266 @@ -1299,9 +1474,14 @@
19267                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
19268                                 break;
19270 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
19271 +                       if (!hrtimer_rt_defer(timer))
19272 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
19273 +                       else
19274 +                               raise = 1;
19275                 }
19276         }
19277 +       if (raise)
19278 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
19281  #ifdef CONFIG_HIGH_RES_TIMERS
19282 @@ -1464,16 +1644,18 @@
19283  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
19285         sl->timer.function = hrtimer_wakeup;
19286 +       sl->timer.irqsafe = 1;
19287         sl->task = task;
19289  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
19291 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
19292 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
19293 +                               unsigned long state)
19295         hrtimer_init_sleeper(t, current);
19297         do {
19298 -               set_current_state(TASK_INTERRUPTIBLE);
19299 +               set_current_state(state);
19300                 hrtimer_start_expires(&t->timer, mode);
19302                 if (likely(t->task))
19303 @@ -1515,7 +1697,8 @@
19304                                 HRTIMER_MODE_ABS);
19305         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
19307 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
19308 +       /* cpu_chill() does not care about restart state. */
19309 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
19310                 goto out;
19312         rmtp = restart->nanosleep.rmtp;
19313 @@ -1532,8 +1715,10 @@
19314         return ret;
19317 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19318 -                      const enum hrtimer_mode mode, const clockid_t clockid)
19319 +static long
19320 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19321 +                   const enum hrtimer_mode mode, const clockid_t clockid,
19322 +                   unsigned long state)
19324         struct restart_block *restart;
19325         struct hrtimer_sleeper t;
19326 @@ -1546,7 +1731,7 @@
19328         hrtimer_init_on_stack(&t.timer, clockid, mode);
19329         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
19330 -       if (do_nanosleep(&t, mode))
19331 +       if (do_nanosleep(&t, mode, state))
19332                 goto out;
19334         /* Absolute timers do not update the rmtp value and restart: */
19335 @@ -1573,6 +1758,12 @@
19336         return ret;
19339 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19340 +                      const enum hrtimer_mode mode, const clockid_t clockid)
19342 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
19345  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
19346                 struct timespec __user *, rmtp)
19348 @@ -1587,6 +1778,26 @@
19349         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
19352 +#ifdef CONFIG_PREEMPT_RT_FULL
19354 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
19355 + */
19356 +void cpu_chill(void)
19358 +       struct timespec tu = {
19359 +               .tv_nsec = NSEC_PER_MSEC,
19360 +       };
19361 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
19363 +       current->flags |= PF_NOFREEZE;
19364 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
19365 +                           TASK_UNINTERRUPTIBLE);
19366 +       if (!freeze_flag)
19367 +               current->flags &= ~PF_NOFREEZE;
19369 +EXPORT_SYMBOL(cpu_chill);
19370 +#endif
19372  /*
19373   * Functions related to boot-time initialization:
19374   */
19375 @@ -1598,10 +1809,14 @@
19376         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
19377                 cpu_base->clock_base[i].cpu_base = cpu_base;
19378                 timerqueue_init_head(&cpu_base->clock_base[i].active);
19379 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
19380         }
19382         cpu_base->cpu = cpu;
19383         hrtimer_init_hres(cpu_base);
19384 +#ifdef CONFIG_PREEMPT_RT_BASE
19385 +       init_waitqueue_head(&cpu_base->wait);
19386 +#endif
19387         return 0;
19390 @@ -1671,9 +1886,26 @@
19392  #endif /* CONFIG_HOTPLUG_CPU */
19394 +#ifdef CONFIG_PREEMPT_RT_BASE
19396 +static void run_hrtimer_softirq(struct softirq_action *h)
19398 +       hrtimer_rt_run_pending();
19401 +static void hrtimers_open_softirq(void)
19403 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
19406 +#else
19407 +static void hrtimers_open_softirq(void) { }
19408 +#endif
19410  void __init hrtimers_init(void)
19412         hrtimers_prepare_cpu(smp_processor_id());
19413 +       hrtimers_open_softirq();
19416  /**
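cpu_chill() above is the RT replacement for cpu_relax()-style busy waiting in retry loops: rather than spinning (and possibly keeping the holder from running), the caller sleeps for one millisecond per attempt. A userspace approximation of the idea follows, under the assumption that a plain nanosleep() stands in for the uninterruptible, non-freezable hrtimer sleep; try_grab() and the loop are made up.

/* Userspace approximation of the cpu_chill() back-off pattern. */
#include <stdbool.h>
#include <time.h>

static void chill(void)
{
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

        /* The kernel version also sets PF_NOFREEZE and sleeps
         * TASK_UNINTERRUPTIBLE; nanosleep() is close enough here. */
        nanosleep(&ts, NULL);
}

static bool try_grab(void)      /* placeholder for a contended trylock */
{
        static int attempts;
        return ++attempts > 3;
}

int main(void)
{
        while (!try_grab())
                chill();        /* give the holder time to release */
        return 0;
}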
19417 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/itimer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/itimer.c
19418 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/itimer.c    2017-04-16 10:38:30.000000000 +0200
19419 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/itimer.c 2017-04-18 17:54:26.000000000 +0200
19420 @@ -213,6 +213,7 @@
19421                 /* We are sharing ->siglock with it_real_fn() */
19422                 if (hrtimer_try_to_cancel(timer) < 0) {
19423                         spin_unlock_irq(&tsk->sighand->siglock);
19424 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
19425                         goto again;
19426                 }
19427                 expires = timeval_to_ktime(value->it_value);
19428 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/jiffies.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/jiffies.c
19429 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/jiffies.c   2017-04-16 10:38:30.000000000 +0200
19430 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/jiffies.c        2017-04-18 17:54:26.000000000 +0200
19431 @@ -74,7 +74,8 @@
19432         .max_cycles     = 10,
19433  };
19435 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
19436 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
19437 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
19439  #if (BITS_PER_LONG < 64)
19440  u64 get_jiffies_64(void)
19441 @@ -83,9 +84,9 @@
19442         u64 ret;
19444         do {
19445 -               seq = read_seqbegin(&jiffies_lock);
19446 +               seq = read_seqcount_begin(&jiffies_seq);
19447                 ret = jiffies_64;
19448 -       } while (read_seqretry(&jiffies_lock, seq));
19449 +       } while (read_seqcount_retry(&jiffies_seq, seq));
19450         return ret;
19452  EXPORT_SYMBOL(get_jiffies_64);
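The seqlock is split into a raw_spinlock_t for writer serialization plus a bare seqcount_t for reader retry, so the write side keeps a non-sleeping lock on RT while readers such as get_jiffies_64() only touch the seqcount. The matching writer-side hunks live elsewhere in this patch (e.g. in the tick code); the pairing is assumed to look roughly like the sketch below, which is illustrative and not copied from the patch.

/* Assumed writer-side pairing for the split jiffies lock; the function
 * name and the ticks parameter are made up for illustration. */
static void jiffies_update_sketch(u64 ticks)
{
        raw_spin_lock(&jiffies_lock);           /* writers: non-sleeping even on RT */
        write_seqcount_begin(&jiffies_seq);     /* readers: start of retry window */
        jiffies_64 += ticks;
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);
}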
19453 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/ntp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/ntp.c
19454 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/ntp.c       2017-04-16 10:38:30.000000000 +0200
19455 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/ntp.c    2017-04-18 17:54:26.000000000 +0200
19456 @@ -17,6 +17,7 @@
19457  #include <linux/module.h>
19458  #include <linux/rtc.h>
19459  #include <linux/math64.h>
19460 +#include <linux/swork.h>
19462  #include "ntp_internal.h"
19463  #include "timekeeping_internal.h"
19464 @@ -568,10 +569,35 @@
19465                            &sync_cmos_work, timespec64_to_jiffies(&next));
19468 +#ifdef CONFIG_PREEMPT_RT_FULL
19470 +static void run_clock_set_delay(struct swork_event *event)
19472 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19475 +static struct swork_event ntp_cmos_swork;
19477 +void ntp_notify_cmos_timer(void)
19479 +       swork_queue(&ntp_cmos_swork);
19482 +static __init int create_cmos_delay_thread(void)
19484 +       WARN_ON(swork_get());
19485 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
19486 +       return 0;
19488 +early_initcall(create_cmos_delay_thread);
19490 +#else
19492  void ntp_notify_cmos_timer(void)
19494         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19496 +#endif /* CONFIG_PREEMPT_RT_FULL */
19498  #else
19499  void ntp_notify_cmos_timer(void) { }
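Both deferral sites above (clock_was_set_delayed() in hrtimer.c and ntp_notify_cmos_timer() here) follow the same three-step swork pattern: bring up the worker once with swork_get(), bind an event to a handler with INIT_SWORK(), and queue from the atomic caller with swork_queue(). Condensed into one hedged sketch; the event and handler names are illustrative.

/* Condensed form of the swork deferral pattern used twice above. */
static struct swork_event deferred_event;

static void deferred_handler(struct swork_event *event)
{
        /* runs in the swork kernel thread, may sleep */
}

static __init int setup_deferred_event(void)
{
        WARN_ON(swork_get());                     /* bring up the worker */
        INIT_SWORK(&deferred_event, deferred_handler);
        return 0;
}
early_initcall(setup_deferred_event);

/* From the hot path (possibly hard irq / atomic context on RT): */
void kick_deferred_work(void)
{
        swork_queue(&deferred_event);
}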
19500 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/posix-cpu-timers.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/posix-cpu-timers.c
19501 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/posix-cpu-timers.c  2017-04-16 10:38:30.000000000 +0200
19502 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/posix-cpu-timers.c       2017-04-18 17:54:26.000000000 +0200
19503 @@ -3,6 +3,7 @@
19504   */
19506  #include <linux/sched.h>
19507 +#include <linux/sched/rt.h>
19508  #include <linux/posix-timers.h>
19509  #include <linux/errno.h>
19510  #include <linux/math64.h>
19511 @@ -620,7 +621,7 @@
19512         /*
19513          * Disarm any old timer after extracting its expiry time.
19514          */
19515 -       WARN_ON_ONCE(!irqs_disabled());
19516 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19518         ret = 0;
19519         old_incr = timer->it.cpu.incr;
19520 @@ -1064,7 +1065,7 @@
19521         /*
19522          * Now re-arm for the new expiry time.
19523          */
19524 -       WARN_ON_ONCE(!irqs_disabled());
19525 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19526         arm_timer(timer);
19527         unlock_task_sighand(p, &flags);
19529 @@ -1153,13 +1154,13 @@
19530   * already updated our counts.  We need to check if any timers fire now.
19531   * Interrupts are disabled.
19532   */
19533 -void run_posix_cpu_timers(struct task_struct *tsk)
19534 +static void __run_posix_cpu_timers(struct task_struct *tsk)
19536         LIST_HEAD(firing);
19537         struct k_itimer *timer, *next;
19538         unsigned long flags;
19540 -       WARN_ON_ONCE(!irqs_disabled());
19541 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19543         /*
19544          * The fast path checks that there are no expired thread or thread
19545 @@ -1213,6 +1214,190 @@
19546         }
19549 +#ifdef CONFIG_PREEMPT_RT_BASE
19550 +#include <linux/kthread.h>
19551 +#include <linux/cpu.h>
19552 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
19553 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
19555 +static int posix_cpu_timers_thread(void *data)
19557 +       int cpu = (long)data;
19559 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
19561 +       while (!kthread_should_stop()) {
19562 +               struct task_struct *tsk = NULL;
19563 +               struct task_struct *next = NULL;
19565 +               if (cpu_is_offline(cpu))
19566 +                       goto wait_to_die;
19568 +               /* grab task list */
19569 +               raw_local_irq_disable();
19570 +               tsk = per_cpu(posix_timer_tasklist, cpu);
19571 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
19572 +               raw_local_irq_enable();
19574 +               /* it's possible the list is empty, just return */
19575 +               if (!tsk) {
19576 +                       set_current_state(TASK_INTERRUPTIBLE);
19577 +                       schedule();
19578 +                       __set_current_state(TASK_RUNNING);
19579 +                       continue;
19580 +               }
19582 +               /* Process task list */
19583 +               while (1) {
19584 +                       /* save next */
19585 +                       next = tsk->posix_timer_list;
19587 +                       /* run the task timers, clear its ptr and
19588 +                        * unreference it
19589 +                        */
19590 +                       __run_posix_cpu_timers(tsk);
19591 +                       tsk->posix_timer_list = NULL;
19592 +                       put_task_struct(tsk);
19594 +                       /* check if this is the last on the list */
19595 +                       if (next == tsk)
19596 +                               break;
19597 +                       tsk = next;
19598 +               }
19599 +       }
19600 +       return 0;
19602 +wait_to_die:
19603 +       /* Wait for kthread_stop */
19604 +       set_current_state(TASK_INTERRUPTIBLE);
19605 +       while (!kthread_should_stop()) {
19606 +               schedule();
19607 +               set_current_state(TASK_INTERRUPTIBLE);
19608 +       }
19609 +       __set_current_state(TASK_RUNNING);
19610 +       return 0;
19613 +static inline int __fastpath_timer_check(struct task_struct *tsk)
19615 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
19616 +       if (unlikely(tsk->exit_state))
19617 +               return 0;
19619 +       if (!task_cputime_zero(&tsk->cputime_expires))
19620 +                       return 1;
19622 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
19623 +                       return 1;
19625 +       return 0;
19628 +void run_posix_cpu_timers(struct task_struct *tsk)
19630 +       unsigned long cpu = smp_processor_id();
19631 +       struct task_struct *tasklist;
19633 +       BUG_ON(!irqs_disabled());
19634 +       if(!per_cpu(posix_timer_task, cpu))
19635 +               return;
19636 +       /* get per-cpu references */
19637 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
19639 +       /* check to see if we're already queued */
19640 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
19641 +               get_task_struct(tsk);
19642 +               if (tasklist) {
19643 +                       tsk->posix_timer_list = tasklist;
19644 +               } else {
19645 +                       /*
19646 +                        * The list is terminated by a self-pointing
19647 +                        * task_struct
19648 +                        */
19649 +                       tsk->posix_timer_list = tsk;
19650 +               }
19651 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
19653 +               wake_up_process(per_cpu(posix_timer_task, cpu));
19654 +       }
19658 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
19659 + * Here we can start up the necessary posix timer thread for the new CPU.
19660 + */
19661 +static int posix_cpu_thread_call(struct notifier_block *nfb,
19662 +                                unsigned long action, void *hcpu)
19664 +       int cpu = (long)hcpu;
19665 +       struct task_struct *p;
19666 +       struct sched_param param;
19668 +       switch (action) {
19669 +       case CPU_UP_PREPARE:
19670 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
19671 +                                       "posixcputmr/%d",cpu);
19672 +               if (IS_ERR(p))
19673 +                       return NOTIFY_BAD;
19674 +               p->flags |= PF_NOFREEZE;
19675 +               kthread_bind(p, cpu);
19676 +               /* Must be high prio to avoid getting starved */
19677 +               param.sched_priority = MAX_RT_PRIO-1;
19678 +               sched_setscheduler(p, SCHED_FIFO, &param);
19679 +               per_cpu(posix_timer_task,cpu) = p;
19680 +               break;
19681 +       case CPU_ONLINE:
19682 +               /* Strictly unnecessary, as first user will wake it. */
19683 +               wake_up_process(per_cpu(posix_timer_task,cpu));
19684 +               break;
19685 +#ifdef CONFIG_HOTPLUG_CPU
19686 +       case CPU_UP_CANCELED:
19687 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
19688 +               kthread_bind(per_cpu(posix_timer_task, cpu),
19689 +                            cpumask_any(cpu_online_mask));
19690 +               kthread_stop(per_cpu(posix_timer_task,cpu));
19691 +               per_cpu(posix_timer_task,cpu) = NULL;
19692 +               break;
19693 +       case CPU_DEAD:
19694 +               kthread_stop(per_cpu(posix_timer_task,cpu));
19695 +               per_cpu(posix_timer_task,cpu) = NULL;
19696 +               break;
19697 +#endif
19698 +       }
19699 +       return NOTIFY_OK;
19702 +/* Register at highest priority so that task migration (migrate_all_tasks)
19703 + * happens before everything else.
19704 + */
19705 +static struct notifier_block posix_cpu_thread_notifier = {
19706 +       .notifier_call = posix_cpu_thread_call,
19707 +       .priority = 10
19710 +static int __init posix_cpu_thread_init(void)
19712 +       void *hcpu = (void *)(long)smp_processor_id();
19713 +       /* Start one for boot CPU. */
19714 +       unsigned long cpu;
19716 +       /* init the per-cpu posix_timer_tasklets */
19717 +       for_each_possible_cpu(cpu)
19718 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
19720 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
19721 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
19722 +       register_cpu_notifier(&posix_cpu_thread_notifier);
19723 +       return 0;
19725 +early_initcall(posix_cpu_thread_init);
19726 +#else /* CONFIG_PREEMPT_RT_BASE */
19727 +void run_posix_cpu_timers(struct task_struct *tsk)
19729 +       __run_posix_cpu_timers(tsk);
19731 +#endif /* CONFIG_PREEMPT_RT_BASE */
19733  /*
19734   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
19735   * The tsk->sighand->siglock must be held by the caller.
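(Editor's note: the hunk above only shows the producer side. On PREEMPT_RT_BASE, run_posix_cpu_timers() merely links the task onto the per-CPU posix_timer_tasklist, terminates the list with a self-pointing task_struct, and wakes the per-CPU posixcputmr/N kthread. The consumer, posix_cpu_timers_thread(), is not part of this excerpt; the following is only an illustrative sketch of how such a self-terminated list could be drained. The helper name drain_posix_timer_tasklist() and the omission of producer/consumer synchronization are assumptions, not code from this patch.)

        /*
         * Illustrative sketch (not from this patch): drain a per-CPU list
         * that run_posix_cpu_timers() terminates with a self-pointing
         * task_struct. Synchronization with the producer is omitted.
         */
        static void drain_posix_timer_tasklist(int cpu)
        {
                struct task_struct *tsk, *next;

                /* detach the whole list in one go */
                tsk = per_cpu(posix_timer_tasklist, cpu);
                per_cpu(posix_timer_tasklist, cpu) = NULL;

                while (tsk) {
                        /* a self-pointing entry marks the end of the list */
                        next = (tsk->posix_timer_list == tsk) ?
                                NULL : tsk->posix_timer_list;
                        tsk->posix_timer_list = NULL;   /* allow re-queueing */
                        __run_posix_cpu_timers(tsk);    /* do the actual work */
                        put_task_struct(tsk);   /* pairs with get_task_struct() */
                        tsk = next;
                }
        }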
19736 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/posix-timers.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/posix-timers.c
19737 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/posix-timers.c      2017-04-16 10:38:30.000000000 +0200
19738 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/posix-timers.c   2017-04-18 17:54:26.000000000 +0200
19739 @@ -506,6 +506,7 @@
19740  static struct pid *good_sigevent(sigevent_t * event)
19742         struct task_struct *rtn = current->group_leader;
19743 +       int sig = event->sigev_signo;
19745         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
19746                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
19747 @@ -514,7 +515,8 @@
19748                 return NULL;
19750         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
19751 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
19752 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
19753 +            sig_kernel_coredump(sig)))
19754                 return NULL;
19756         return task_pid(rtn);
19757 @@ -826,6 +828,20 @@
19758         return overrun;
19762 + * Protected by RCU!
19763 + */
19764 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
19766 +#ifdef CONFIG_PREEMPT_RT_FULL
19767 +       if (kc->timer_set == common_timer_set)
19768 +               hrtimer_wait_for_timer(&timr->it.real.timer);
19769 +       else
19770 +               /* FIXME: Whacky hack for posix-cpu-timers */
19771 +               schedule_timeout(1);
19772 +#endif
19775  /* Set a POSIX.1b interval timer. */
19776  /* timr->it_lock is taken. */
19777  static int
19778 @@ -903,6 +919,7 @@
19779         if (!timr)
19780                 return -EINVAL;
19782 +       rcu_read_lock();
19783         kc = clockid_to_kclock(timr->it_clock);
19784         if (WARN_ON_ONCE(!kc || !kc->timer_set))
19785                 error = -EINVAL;
19786 @@ -911,9 +928,12 @@
19788         unlock_timer(timr, flag);
19789         if (error == TIMER_RETRY) {
19790 +               timer_wait_for_callback(kc, timr);
19791                 rtn = NULL;     // We already got the old time...
19792 +               rcu_read_unlock();
19793                 goto retry;
19794         }
19795 +       rcu_read_unlock();
19797         if (old_setting && !error &&
19798             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
19799 @@ -951,10 +971,15 @@
19800         if (!timer)
19801                 return -EINVAL;
19803 +       rcu_read_lock();
19804         if (timer_delete_hook(timer) == TIMER_RETRY) {
19805                 unlock_timer(timer, flags);
19806 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
19807 +                                       timer);
19808 +               rcu_read_unlock();
19809                 goto retry_delete;
19810         }
19811 +       rcu_read_unlock();
19813         spin_lock(&current->sighand->siglock);
19814         list_del(&timer->list);
19815 @@ -980,8 +1005,18 @@
19816  retry_delete:
19817         spin_lock_irqsave(&timer->it_lock, flags);
19819 +       /* On RT we can race with a deletion */
19820 +       if (!timer->it_signal) {
19821 +               unlock_timer(timer, flags);
19822 +               return;
19823 +       }
19825         if (timer_delete_hook(timer) == TIMER_RETRY) {
19826 +               rcu_read_lock();
19827                 unlock_timer(timer, flags);
19828 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
19829 +                                       timer);
19830 +               rcu_read_unlock();
19831                 goto retry_delete;
19832         }
19833         list_del(&timer->list);
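(Editor's note: both delete paths above follow the same RT pattern, and the "Protected by RCU!" note on timer_wait_for_callback() gives the reason: the RCU read section spans the window in which it_lock is dropped, so the k_itimer being waited on stays valid while the task sleeps until the running callback finishes. The following is only a condensed sketch of the shape of the hunks above, not new behaviour.)

        /* condensed shape of the RT delete/retry path shown above */
retry_delete:
        spin_lock_irqsave(&timer->it_lock, flags);

        rcu_read_lock();
        if (timer_delete_hook(timer) == TIMER_RETRY) {
                unlock_timer(timer, flags);
                /* sleep until the hrtimer callback has finished */
                timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
                                        timer);
                rcu_read_unlock();
                goto retry_delete;
        }
        rcu_read_unlock();
        /* ... the timer is no longer running and can be torn down ... */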
19834 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-broadcast-hrtimer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-broadcast-hrtimer.c
19835 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-broadcast-hrtimer.c    2017-04-16 10:38:30.000000000 +0200
19836 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-broadcast-hrtimer.c 2017-04-18 17:54:26.000000000 +0200
19837 @@ -107,5 +107,6 @@
19839         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
19840         bctimer.function = bc_handler;
19841 +       bctimer.irqsafe = true;
19842         clockevents_register_device(&ce_broadcast_hrtimer);
19844 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-common.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-common.c
19845 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-common.c       2017-04-16 10:38:30.000000000 +0200
19846 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-common.c    2017-04-18 17:54:26.000000000 +0200
19847 @@ -79,13 +79,15 @@
19848  static void tick_periodic(int cpu)
19850         if (tick_do_timer_cpu == cpu) {
19851 -               write_seqlock(&jiffies_lock);
19852 +               raw_spin_lock(&jiffies_lock);
19853 +               write_seqcount_begin(&jiffies_seq);
19855                 /* Keep track of the next tick event */
19856                 tick_next_period = ktime_add(tick_next_period, tick_period);
19858                 do_timer(1);
19859 -               write_sequnlock(&jiffies_lock);
19860 +               write_seqcount_end(&jiffies_seq);
19861 +               raw_spin_unlock(&jiffies_lock);
19862                 update_wall_time();
19863         }
19865 @@ -157,9 +159,9 @@
19866                 ktime_t next;
19868                 do {
19869 -                       seq = read_seqbegin(&jiffies_lock);
19870 +                       seq = read_seqcount_begin(&jiffies_seq);
19871                         next = tick_next_period;
19872 -               } while (read_seqretry(&jiffies_lock, seq));
19873 +               } while (read_seqcount_retry(&jiffies_seq, seq));
19875                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
19877 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-sched.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-sched.c
19878 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/tick-sched.c        2017-04-16 10:38:30.000000000 +0200
19879 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/tick-sched.c     2017-04-18 17:54:26.000000000 +0200
19880 @@ -62,7 +62,8 @@
19881                 return;
19883         /* Reevaluate with jiffies_lock held */
19884 -       write_seqlock(&jiffies_lock);
19885 +       raw_spin_lock(&jiffies_lock);
19886 +       write_seqcount_begin(&jiffies_seq);
19888         delta = ktime_sub(now, last_jiffies_update);
19889         if (delta.tv64 >= tick_period.tv64) {
19890 @@ -85,10 +86,12 @@
19891                 /* Keep the tick_next_period variable up to date */
19892                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
19893         } else {
19894 -               write_sequnlock(&jiffies_lock);
19895 +               write_seqcount_end(&jiffies_seq);
19896 +               raw_spin_unlock(&jiffies_lock);
19897                 return;
19898         }
19899 -       write_sequnlock(&jiffies_lock);
19900 +       write_seqcount_end(&jiffies_seq);
19901 +       raw_spin_unlock(&jiffies_lock);
19902         update_wall_time();
19905 @@ -99,12 +102,14 @@
19907         ktime_t period;
19909 -       write_seqlock(&jiffies_lock);
19910 +       raw_spin_lock(&jiffies_lock);
19911 +       write_seqcount_begin(&jiffies_seq);
19912         /* Did we start the jiffies update yet ? */
19913         if (last_jiffies_update.tv64 == 0)
19914                 last_jiffies_update = tick_next_period;
19915         period = last_jiffies_update;
19916 -       write_sequnlock(&jiffies_lock);
19917 +       write_seqcount_end(&jiffies_seq);
19918 +       raw_spin_unlock(&jiffies_lock);
19919         return period;
19922 @@ -215,6 +220,7 @@
19924  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
19925         .func = nohz_full_kick_func,
19926 +       .flags = IRQ_WORK_HARD_IRQ,
19927  };
19929  /*
19930 @@ -673,10 +679,10 @@
19932         /* Read jiffies and the time when jiffies were updated last */
19933         do {
19934 -               seq = read_seqbegin(&jiffies_lock);
19935 +               seq = read_seqcount_begin(&jiffies_seq);
19936                 basemono = last_jiffies_update.tv64;
19937                 basejiff = jiffies;
19938 -       } while (read_seqretry(&jiffies_lock, seq));
19939 +       } while (read_seqcount_retry(&jiffies_seq, seq));
19940         ts->last_jiffies = basejiff;
19942         if (rcu_needs_cpu(basemono, &next_rcu) ||
19943 @@ -877,14 +883,7 @@
19944                 return false;
19946         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
19947 -               static int ratelimit;
19949 -               if (ratelimit < 10 &&
19950 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
19951 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
19952 -                               (unsigned int) local_softirq_pending());
19953 -                       ratelimit++;
19954 -               }
19955 +               softirq_check_pending_idle();
19956                 return false;
19957         }
19959 @@ -1193,6 +1192,7 @@
19960          * Emulate tick processing via per-CPU hrtimers:
19961          */
19962         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
19963 +       ts->sched_timer.irqsafe = 1;
19964         ts->sched_timer.function = tick_sched_timer;
19966         /* Get the next period (per-CPU) */
19967 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timekeeping.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timekeeping.c
19968 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timekeeping.c       2017-04-16 10:38:30.000000000 +0200
19969 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timekeeping.c    2017-04-18 17:54:26.000000000 +0200
19970 @@ -2328,8 +2328,10 @@
19971   */
19972  void xtime_update(unsigned long ticks)
19974 -       write_seqlock(&jiffies_lock);
19975 +       raw_spin_lock(&jiffies_lock);
19976 +       write_seqcount_begin(&jiffies_seq);
19977         do_timer(ticks);
19978 -       write_sequnlock(&jiffies_lock);
19979 +       write_seqcount_end(&jiffies_seq);
19980 +       raw_spin_unlock(&jiffies_lock);
19981         update_wall_time();
19983 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timekeeping.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timekeeping.h
19984 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timekeeping.h       2017-04-16 10:38:30.000000000 +0200
19985 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timekeeping.h    2017-04-18 17:54:26.000000000 +0200
19986 @@ -19,7 +19,8 @@
19987  extern void do_timer(unsigned long ticks);
19988  extern void update_wall_time(void);
19990 -extern seqlock_t jiffies_lock;
19991 +extern raw_spinlock_t jiffies_lock;
19992 +extern seqcount_t jiffies_seq;
19994  #define CS_NAME_LEN    32
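(Editor's note: the tick-common.c, tick-sched.c, timekeeping.c and timekeeping.h hunks above all apply the same conversion: the jiffies_lock seqlock is split into a raw spinlock, which serializes writers without becoming a sleeping lock on RT, plus a separate seqcount for lockless readers. Collected in one place, the resulting pattern from those hunks looks like this; shown here only for reference.)

        /* writer side, e.g. tick_periodic() or xtime_update() */
        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);
        do_timer(1);
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        /* reader side, e.g. the NOHZ code in tick-sched.c */
        unsigned int seq;
        do {
                seq = read_seqcount_begin(&jiffies_seq);
                basemono = last_jiffies_update.tv64;
                basejiff = jiffies;
        } while (read_seqcount_retry(&jiffies_seq, seq));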
19996 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timer.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timer.c
19997 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/time/timer.c     2017-04-16 10:38:30.000000000 +0200
19998 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/time/timer.c  2017-04-18 17:54:26.000000000 +0200
19999 @@ -193,8 +193,11 @@
20000  #endif
20002  struct timer_base {
20003 -       spinlock_t              lock;
20004 +       raw_spinlock_t          lock;
20005         struct timer_list       *running_timer;
20006 +#ifdef CONFIG_PREEMPT_RT_FULL
20007 +       struct swait_queue_head wait_for_running_timer;
20008 +#endif
20009         unsigned long           clk;
20010         unsigned long           next_expiry;
20011         unsigned int            cpu;
20012 @@ -203,6 +206,8 @@
20013         bool                    is_idle;
20014         DECLARE_BITMAP(pending_map, WHEEL_SIZE);
20015         struct hlist_head       vectors[WHEEL_SIZE];
20016 +       struct hlist_head       expired_lists[LVL_DEPTH];
20017 +       int                     expired_count;
20018  } ____cacheline_aligned;
20020  static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
20021 @@ -948,10 +953,10 @@
20023                 if (!(tf & TIMER_MIGRATING)) {
20024                         base = get_timer_base(tf);
20025 -                       spin_lock_irqsave(&base->lock, *flags);
20026 +                       raw_spin_lock_irqsave(&base->lock, *flags);
20027                         if (timer->flags == tf)
20028                                 return base;
20029 -                       spin_unlock_irqrestore(&base->lock, *flags);
20030 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
20031                 }
20032                 cpu_relax();
20033         }
20034 @@ -1023,9 +1028,9 @@
20035                         /* See the comment in lock_timer_base() */
20036                         timer->flags |= TIMER_MIGRATING;
20038 -                       spin_unlock(&base->lock);
20039 +                       raw_spin_unlock(&base->lock);
20040                         base = new_base;
20041 -                       spin_lock(&base->lock);
20042 +                       raw_spin_lock(&base->lock);
20043                         WRITE_ONCE(timer->flags,
20044                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
20045                 }
20046 @@ -1050,7 +1055,7 @@
20047         }
20049  out_unlock:
20050 -       spin_unlock_irqrestore(&base->lock, flags);
20051 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20053         return ret;
20055 @@ -1144,19 +1149,46 @@
20056         if (base != new_base) {
20057                 timer->flags |= TIMER_MIGRATING;
20059 -               spin_unlock(&base->lock);
20060 +               raw_spin_unlock(&base->lock);
20061                 base = new_base;
20062 -               spin_lock(&base->lock);
20063 +               raw_spin_lock(&base->lock);
20064                 WRITE_ONCE(timer->flags,
20065                            (timer->flags & ~TIMER_BASEMASK) | cpu);
20066         }
20068         debug_activate(timer, timer->expires);
20069         internal_add_timer(base, timer);
20070 -       spin_unlock_irqrestore(&base->lock, flags);
20071 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20073  EXPORT_SYMBOL_GPL(add_timer_on);
20075 +#ifdef CONFIG_PREEMPT_RT_FULL
20077 + * Wait for a running timer
20078 + */
20079 +static void wait_for_running_timer(struct timer_list *timer)
20081 +       struct timer_base *base;
20082 +       u32 tf = timer->flags;
20084 +       if (tf & TIMER_MIGRATING)
20085 +               return;
20087 +       base = get_timer_base(tf);
20088 +       swait_event(base->wait_for_running_timer,
20089 +                  base->running_timer != timer);
20092 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
20093 +#else
20094 +static inline void wait_for_running_timer(struct timer_list *timer)
20096 +       cpu_relax();
20099 +# define wakeup_timer_waiters(b)       do { } while (0)
20100 +#endif
20102  /**
20103   * del_timer - deactive a timer.
20104   * @timer: the timer to be deactivated
20105 @@ -1180,7 +1212,7 @@
20106         if (timer_pending(timer)) {
20107                 base = lock_timer_base(timer, &flags);
20108                 ret = detach_if_pending(timer, base, true);
20109 -               spin_unlock_irqrestore(&base->lock, flags);
20110 +               raw_spin_unlock_irqrestore(&base->lock, flags);
20111         }
20113         return ret;
20114 @@ -1208,13 +1240,13 @@
20115                 timer_stats_timer_clear_start_info(timer);
20116                 ret = detach_if_pending(timer, base, true);
20117         }
20118 -       spin_unlock_irqrestore(&base->lock, flags);
20119 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20121         return ret;
20123  EXPORT_SYMBOL(try_to_del_timer_sync);
20125 -#ifdef CONFIG_SMP
20126 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
20127  /**
20128   * del_timer_sync - deactivate a timer and wait for the handler to finish.
20129   * @timer: the timer to be deactivated
20130 @@ -1274,7 +1306,7 @@
20131                 int ret = try_to_del_timer_sync(timer);
20132                 if (ret >= 0)
20133                         return ret;
20134 -               cpu_relax();
20135 +               wait_for_running_timer(timer);
20136         }
20138  EXPORT_SYMBOL(del_timer_sync);
20139 @@ -1323,7 +1355,8 @@
20140         }
20143 -static void expire_timers(struct timer_base *base, struct hlist_head *head)
20144 +static inline void __expire_timers(struct timer_base *base,
20145 +                                  struct hlist_head *head)
20147         while (!hlist_empty(head)) {
20148                 struct timer_list *timer;
20149 @@ -1339,33 +1372,53 @@
20150                 fn = timer->function;
20151                 data = timer->data;
20153 -               if (timer->flags & TIMER_IRQSAFE) {
20154 -                       spin_unlock(&base->lock);
20155 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
20156 +                   timer->flags & TIMER_IRQSAFE) {
20157 +                       raw_spin_unlock(&base->lock);
20158                         call_timer_fn(timer, fn, data);
20159 -                       spin_lock(&base->lock);
20160 +                       base->running_timer = NULL;
20161 +                       raw_spin_lock(&base->lock);
20162                 } else {
20163 -                       spin_unlock_irq(&base->lock);
20164 +                       raw_spin_unlock_irq(&base->lock);
20165                         call_timer_fn(timer, fn, data);
20166 -                       spin_lock_irq(&base->lock);
20167 +                       base->running_timer = NULL;
20168 +                       raw_spin_lock_irq(&base->lock);
20169                 }
20170         }
20173 -static int __collect_expired_timers(struct timer_base *base,
20174 -                                   struct hlist_head *heads)
20175 +static void expire_timers(struct timer_base *base)
20177 +       struct hlist_head *head;
20179 +       while (base->expired_count--) {
20180 +               head = base->expired_lists + base->expired_count;
20181 +               __expire_timers(base, head);
20182 +       }
20183 +       base->expired_count = 0;
20186 +static void __collect_expired_timers(struct timer_base *base)
20188         unsigned long clk = base->clk;
20189         struct hlist_head *vec;
20190 -       int i, levels = 0;
20191 +       int i;
20192         unsigned int idx;
20194 +       /*
20195 +        * expire_timers() must be called at least once before we can
20196 +        * collect more timers
20197 +        */
20198 +       if (WARN_ON(base->expired_count))
20199 +               return;
20201         for (i = 0; i < LVL_DEPTH; i++) {
20202                 idx = (clk & LVL_MASK) + i * LVL_SIZE;
20204                 if (__test_and_clear_bit(idx, base->pending_map)) {
20205                         vec = base->vectors + idx;
20206 -                       hlist_move_list(vec, heads++);
20207 -                       levels++;
20208 +                       hlist_move_list(vec,
20209 +                               &base->expired_lists[base->expired_count++]);
20210                 }
20211                 /* Is it time to look at the next level? */
20212                 if (clk & LVL_CLK_MASK)
20213 @@ -1373,7 +1426,6 @@
20214                 /* Shift clock for the next level granularity */
20215                 clk >>= LVL_CLK_SHIFT;
20216         }
20217 -       return levels;
20220  #ifdef CONFIG_NO_HZ_COMMON
20221 @@ -1515,7 +1567,7 @@
20222         if (cpu_is_offline(smp_processor_id()))
20223                 return expires;
20225 -       spin_lock(&base->lock);
20226 +       raw_spin_lock(&base->lock);
20227         nextevt = __next_timer_interrupt(base);
20228         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
20229         base->next_expiry = nextevt;
20230 @@ -1543,7 +1595,7 @@
20231                 if ((expires - basem) > TICK_NSEC)
20232                         base->is_idle = true;
20233         }
20234 -       spin_unlock(&base->lock);
20235 +       raw_spin_unlock(&base->lock);
20237         return cmp_next_hrtimer_event(basem, expires);
20239 @@ -1566,8 +1618,7 @@
20240         base->is_idle = false;
20243 -static int collect_expired_timers(struct timer_base *base,
20244 -                                 struct hlist_head *heads)
20245 +static void collect_expired_timers(struct timer_base *base)
20247         /*
20248          * NOHZ optimization. After a long idle sleep we need to forward the
20249 @@ -1584,20 +1635,49 @@
20250                 if (time_after(next, jiffies)) {
20251                         /* The call site will increment clock! */
20252                         base->clk = jiffies - 1;
20253 -                       return 0;
20254 +                       return;
20255                 }
20256                 base->clk = next;
20257         }
20258 -       return __collect_expired_timers(base, heads);
20259 +       __collect_expired_timers(base);
20261  #else
20262 -static inline int collect_expired_timers(struct timer_base *base,
20263 -                                        struct hlist_head *heads)
20264 +static inline void collect_expired_timers(struct timer_base *base)
20266 -       return __collect_expired_timers(base, heads);
20267 +       __collect_expired_timers(base);
20269  #endif
20271 +static int find_expired_timers(struct timer_base *base)
20273 +       const unsigned long int end_clk = jiffies;
20275 +       while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
20276 +               collect_expired_timers(base);
20277 +               base->clk++;
20278 +       }
20280 +       return base->expired_count;
20283 +/* Called from CPU tick routine to quickly collect expired timers */
20284 +static int tick_find_expired(struct timer_base *base)
20286 +       int count;
20288 +       raw_spin_lock(&base->lock);
20290 +       if (unlikely(time_after(jiffies, base->clk + HZ))) {
20291 +               /* defer to ktimersoftd; don't spend too long in irq context */
20292 +               count = -1;
20293 +       } else
20294 +               count = find_expired_timers(base);
20296 +       raw_spin_unlock(&base->lock);
20298 +       return count;
20301  /*
20302   * Called from the timer interrupt handler to charge one tick to the current
20303   * process.  user_tick is 1 if the tick is user time, 0 for system.
20304 @@ -1608,13 +1688,13 @@
20306         /* Note: this timer irq context must be accounted for as well. */
20307         account_process_tick(p, user_tick);
20308 +       scheduler_tick();
20309         run_local_timers();
20310         rcu_check_callbacks(user_tick);
20311 -#ifdef CONFIG_IRQ_WORK
20312 +#if defined(CONFIG_IRQ_WORK)
20313         if (in_irq())
20314                 irq_work_tick();
20315  #endif
20316 -       scheduler_tick();
20317         run_posix_cpu_timers(p);
20320 @@ -1624,24 +1704,13 @@
20321   */
20322  static inline void __run_timers(struct timer_base *base)
20324 -       struct hlist_head heads[LVL_DEPTH];
20325 -       int levels;
20327 -       if (!time_after_eq(jiffies, base->clk))
20328 -               return;
20329 +       raw_spin_lock_irq(&base->lock);
20331 -       spin_lock_irq(&base->lock);
20332 +       while (find_expired_timers(base))
20333 +               expire_timers(base);
20335 -       while (time_after_eq(jiffies, base->clk)) {
20337 -               levels = collect_expired_timers(base, heads);
20338 -               base->clk++;
20340 -               while (levels--)
20341 -                       expire_timers(base, heads + levels);
20342 -       }
20343 -       base->running_timer = NULL;
20344 -       spin_unlock_irq(&base->lock);
20345 +       raw_spin_unlock_irq(&base->lock);
20346 +       wakeup_timer_waiters(base);
20349  /*
20350 @@ -1651,6 +1720,8 @@
20352         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
20354 +       irq_work_tick_soft();
20356         __run_timers(base);
20357         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
20358                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
20359 @@ -1665,12 +1736,12 @@
20361         hrtimer_run_queues();
20362         /* Raise the softirq only if required. */
20363 -       if (time_before(jiffies, base->clk)) {
20364 +       if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
20365                 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
20366                         return;
20367                 /* CPU is awake, so check the deferrable base. */
20368                 base++;
20369 -               if (time_before(jiffies, base->clk))
20370 +               if (time_before(jiffies, base->clk) || !tick_find_expired(base))
20371                         return;
20372         }
20373         raise_softirq(TIMER_SOFTIRQ);
20374 @@ -1836,16 +1907,17 @@
20375                  * The caller is globally serialized and nobody else
20376                  * takes two locks at once, deadlock is not possible.
20377                  */
20378 -               spin_lock_irq(&new_base->lock);
20379 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20380 +               raw_spin_lock_irq(&new_base->lock);
20381 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20383                 BUG_ON(old_base->running_timer);
20384 +               BUG_ON(old_base->expired_count);
20386                 for (i = 0; i < WHEEL_SIZE; i++)
20387                         migrate_timer_list(new_base, old_base->vectors + i);
20389 -               spin_unlock(&old_base->lock);
20390 -               spin_unlock_irq(&new_base->lock);
20391 +               raw_spin_unlock(&old_base->lock);
20392 +               raw_spin_unlock_irq(&new_base->lock);
20393                 put_cpu_ptr(&timer_bases);
20394         }
20395         return 0;
20396 @@ -1861,8 +1933,12 @@
20397         for (i = 0; i < NR_BASES; i++) {
20398                 base = per_cpu_ptr(&timer_bases[i], cpu);
20399                 base->cpu = cpu;
20400 -               spin_lock_init(&base->lock);
20401 +               raw_spin_lock_init(&base->lock);
20402                 base->clk = jiffies;
20403 +#ifdef CONFIG_PREEMPT_RT_FULL
20404 +               init_swait_queue_head(&base->wait_for_running_timer);
20405 +#endif
20406 +               base->expired_count = 0;
20407         }
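(Editor's note: in summary, the timer-wheel changes above make the timer softirq preemption-friendly on RT: base->lock becomes a raw spinlock, expired timers are first collected into base->expired_lists, and del_timer_sync() no longer busy-waits on a running callback but sleeps on a simple wait queue that __run_timers() wakes once the callbacks have completed. A condensed view of the two cooperating sides, restating the hunks above:)

        /* deleter (del_timer_sync) */
        for (;;) {
                int ret = try_to_del_timer_sync(timer);
                if (ret >= 0)
                        return ret;
                /* on RT: swait_event() until base->running_timer != timer */
                wait_for_running_timer(timer);
        }

        /* softirq (__run_timers) */
        raw_spin_lock_irq(&base->lock);
        while (find_expired_timers(base))
                expire_timers(base);
        raw_spin_unlock_irq(&base->lock);
        wakeup_timer_waiters(base);     /* swake_up_all() on RT, no-op otherwise */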
20410 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/Kconfig
20411 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/Kconfig    2017-04-16 10:38:30.000000000 +0200
20412 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/Kconfig 2017-04-18 17:54:26.000000000 +0200
20413 @@ -182,6 +182,24 @@
20414           enabled. This option and the preempt-off timing option can be
20415           used together or separately.)
20417 +config INTERRUPT_OFF_HIST
20418 +       bool "Interrupts-off Latency Histogram"
20419 +       depends on IRQSOFF_TRACER
20420 +       help
20421 +         This option generates continuously updated histograms (one per cpu)
20422 +         of the duration of time periods with interrupts disabled. The
20423 +         histograms are disabled by default. To enable them, write a non-zero
20424 +         number to
20426 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20428 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
20429 +         per cpu) are generated that accumulate the duration of time periods
20430 +         when both interrupts and preemption are disabled. The histogram data
20431 +         will be located in the debug file system at
20433 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
20435  config PREEMPT_TRACER
20436         bool "Preemption-off Latency Tracer"
20437         default n
20438 @@ -206,6 +224,24 @@
20439           enabled. This option and the irqs-off timing option can be
20440           used together or separately.)
20442 +config PREEMPT_OFF_HIST
20443 +       bool "Preemption-off Latency Histogram"
20444 +       depends on PREEMPT_TRACER
20445 +       help
20446 +         This option generates continuously updated histograms (one per cpu)
20447 +         of the duration of time periods with preemption disabled. The
20448 +         histograms are disabled by default. To enable them, write a non-zero
20449 +         number to
20451 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20453 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
20454 +         per cpu) are generated that accumulate the duration of time periods
20455 +         when both interrupts and preemption are disabled. The histogram data
20456 +         will be located in the debug file system at
20458 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
20460  config SCHED_TRACER
20461         bool "Scheduling Latency Tracer"
20462         select GENERIC_TRACER
20463 @@ -251,6 +287,74 @@
20464          file. Every time a latency is greater than tracing_thresh, it will
20465          be recorded into the ring buffer.
20467 +config WAKEUP_LATENCY_HIST
20468 +       bool "Scheduling Latency Histogram"
20469 +       depends on SCHED_TRACER
20470 +       help
20471 +         This option generates continuously updated histograms (one per cpu)
20472 +         of the scheduling latency of the highest priority task.
20473 +         The histograms are disabled by default. To enable them, write a
20474 +         non-zero number to
20476 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
20478 +         Two different algorithms are used, one to determine the latency of
20479 +         processes that exclusively use the highest priority of the system and
20480 +         another one to determine the latency of processes that share the
20481 +         highest system priority with other processes. The former is used to
20482 +         improve hardware and system software, the latter to optimize the
20483 +         priority design of a given system. The histogram data will be
20484 +         located in the debug file system at
20486 +             /sys/kernel/debug/tracing/latency_hist/wakeup
20488 +         and
20490 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
20492 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20493 +         Histogram are selected, additional histogram data will be collected
20494 +         that contain, in addition to the wakeup latency, the timer latency, in
20495 +         case the wakeup was triggered by an expired timer. These histograms
20496 +         are available in the
20498 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20500 +         directory. They reflect the apparent interrupt and scheduling latency
20501 +         and are best suitable to determine the worst-case latency of a given
20502 +         and are best suited to determine the worst-case latency of a given
20504 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20506 +config MISSED_TIMER_OFFSETS_HIST
20507 +       depends on HIGH_RES_TIMERS
20508 +       select GENERIC_TRACER
20509 +       bool "Missed Timer Offsets Histogram"
20510 +       help
20511 +         Generate a histogram of missed timer offsets in microseconds. The
20512 +         histograms are disabled by default. To enable them, write a non-zero
20513 +         number to
20515 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
20517 +         The histogram data will be located in the debug file system at
20519 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
20521 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20522 +         Histogram are selected, additional histogram data will be collected
20523 +         that contain, in addition to the wakeup latency, the timer latency, in
20524 +         case the wakeup was triggered by an expired timer. These histograms
20525 +         are available in the
20527 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20529 +         directory. They reflect the apparent interrupt and scheduling latency
20530 +         and are best suited to determine the worst-case latency of a given
20531 +         system. To enable these histograms, write a non-zero number to
20533 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20535  config ENABLE_DEFAULT_TRACERS
20536         bool "Trace process context switches and events"
20537         depends on !GENERIC_TRACER
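(Editor's note: for reference, the debugfs interface described in the help texts above can be driven from user space as shown below. This is a minimal sketch, assuming debugfs is mounted at /sys/kernel/debug; the enable/ and irqsoff/ paths come from the help texts, while the per-CPU file name "CPU0" is an assumption that is not visible in this hunk.)

        #include <stdio.h>

        int main(void)
        {
                /* enable the combined preempt/irqs-off histogram (any non-zero value) */
                FILE *f = fopen("/sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff", "w");
                char line[256];

                if (!f)
                        return 1;
                fputs("1\n", f);
                fclose(f);

                /* dump the irqs-off histogram of CPU 0 ("#usecs  samples" lines) */
                f = fopen("/sys/kernel/debug/tracing/latency_hist/irqsoff/CPU0", "r");
                if (!f)
                        return 1;
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);
                fclose(f);
                return 0;
        }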
20538 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/Makefile linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/Makefile
20539 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/Makefile   2017-04-16 10:38:30.000000000 +0200
20540 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/Makefile        2017-04-18 17:54:26.000000000 +0200
20541 @@ -38,6 +38,10 @@
20542  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
20543  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
20544  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
20545 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
20546 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
20547 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
20548 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
20549  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
20550  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
20551  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
20552 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/latency_hist.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/latency_hist.c
20553 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/latency_hist.c     1970-01-01 01:00:00.000000000 +0100
20554 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/latency_hist.c  2017-04-18 17:54:26.000000000 +0200
20555 @@ -0,0 +1,1178 @@
20557 + * kernel/trace/latency_hist.c
20558 + *
20559 + * Add support for histograms of preemption-off latency and
20560 + * interrupt-off latency and wakeup latency; it depends on
20561 + * Real-Time Preemption Support.
20562 + *
20563 + *  Copyright (C) 2005 MontaVista Software, Inc.
20564 + *  Yi Yang <yyang@ch.mvista.com>
20565 + *
20566 + *  Converted to work with the new latency tracer.
20567 + *  Copyright (C) 2008 Red Hat, Inc.
20568 + *    Steven Rostedt <srostedt@redhat.com>
20569 + *
20570 + */
20571 +#include <linux/module.h>
20572 +#include <linux/debugfs.h>
20573 +#include <linux/seq_file.h>
20574 +#include <linux/percpu.h>
20575 +#include <linux/kallsyms.h>
20576 +#include <linux/uaccess.h>
20577 +#include <linux/sched.h>
20578 +#include <linux/sched/rt.h>
20579 +#include <linux/slab.h>
20580 +#include <linux/atomic.h>
20581 +#include <asm/div64.h>
20583 +#include "trace.h"
20584 +#include <trace/events/sched.h>
20586 +#define NSECS_PER_USECS 1000L
20588 +#define CREATE_TRACE_POINTS
20589 +#include <trace/events/hist.h>
20591 +enum {
20592 +       IRQSOFF_LATENCY = 0,
20593 +       PREEMPTOFF_LATENCY,
20594 +       PREEMPTIRQSOFF_LATENCY,
20595 +       WAKEUP_LATENCY,
20596 +       WAKEUP_LATENCY_SHAREDPRIO,
20597 +       MISSED_TIMER_OFFSETS,
20598 +       TIMERANDWAKEUP_LATENCY,
20599 +       MAX_LATENCY_TYPE,
20602 +#define MAX_ENTRY_NUM 10240
20604 +struct hist_data {
20605 +       atomic_t hist_mode; /* 0 log, 1 don't log */
20606 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
20607 +       long min_lat;
20608 +       long max_lat;
20609 +       unsigned long long below_hist_bound_samples;
20610 +       unsigned long long above_hist_bound_samples;
20611 +       long long accumulate_lat;
20612 +       unsigned long long total_samples;
20613 +       unsigned long long hist_array[MAX_ENTRY_NUM];
20616 +struct enable_data {
20617 +       int latency_type;
20618 +       int enabled;
20621 +static char *latency_hist_dir_root = "latency_hist";
20623 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20624 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
20625 +static char *irqsoff_hist_dir = "irqsoff";
20626 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
20627 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
20628 +#endif
20630 +#ifdef CONFIG_PREEMPT_OFF_HIST
20631 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
20632 +static char *preemptoff_hist_dir = "preemptoff";
20633 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
20634 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
20635 +#endif
20637 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
20638 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
20639 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
20640 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
20641 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
20642 +#endif
20644 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
20645 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
20646 +static struct enable_data preemptirqsoff_enabled_data = {
20647 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
20648 +       .enabled = 0,
20650 +#endif
20652 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20653 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20654 +struct maxlatproc_data {
20655 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
20656 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
20657 +       int pid;
20658 +       int current_pid;
20659 +       int prio;
20660 +       int current_prio;
20661 +       long latency;
20662 +       long timeroffset;
20663 +       cycle_t timestamp;
20665 +#endif
20667 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20668 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
20669 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
20670 +static char *wakeup_latency_hist_dir = "wakeup";
20671 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
20672 +static notrace void probe_wakeup_latency_hist_start(void *v,
20673 +       struct task_struct *p);
20674 +static notrace void probe_wakeup_latency_hist_stop(void *v,
20675 +       bool preempt, struct task_struct *prev, struct task_struct *next);
20676 +static notrace void probe_sched_migrate_task(void *,
20677 +       struct task_struct *task, int cpu);
20678 +static struct enable_data wakeup_latency_enabled_data = {
20679 +       .latency_type = WAKEUP_LATENCY,
20680 +       .enabled = 0,
20682 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
20683 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
20684 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
20685 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
20686 +static unsigned long wakeup_pid;
20687 +#endif
20689 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20690 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
20691 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
20692 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
20693 +       long long offset, struct task_struct *curr, struct task_struct *task);
20694 +static struct enable_data missed_timer_offsets_enabled_data = {
20695 +       .latency_type = MISSED_TIMER_OFFSETS,
20696 +       .enabled = 0,
20698 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
20699 +static unsigned long missed_timer_offsets_pid;
20700 +#endif
20702 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20703 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20704 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
20705 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
20706 +static struct enable_data timerandwakeup_enabled_data = {
20707 +       .latency_type = TIMERANDWAKEUP_LATENCY,
20708 +       .enabled = 0,
20710 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
20711 +#endif
20713 +void notrace latency_hist(int latency_type, int cpu, long latency,
20714 +                         long timeroffset, cycle_t stop,
20715 +                         struct task_struct *p)
20717 +       struct hist_data *my_hist;
20718 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20719 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20720 +       struct maxlatproc_data *mp = NULL;
20721 +#endif
20723 +       if (!cpu_possible(cpu) || latency_type < 0 ||
20724 +           latency_type >= MAX_LATENCY_TYPE)
20725 +               return;
20727 +       switch (latency_type) {
20728 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20729 +       case IRQSOFF_LATENCY:
20730 +               my_hist = &per_cpu(irqsoff_hist, cpu);
20731 +               break;
20732 +#endif
20733 +#ifdef CONFIG_PREEMPT_OFF_HIST
20734 +       case PREEMPTOFF_LATENCY:
20735 +               my_hist = &per_cpu(preemptoff_hist, cpu);
20736 +               break;
20737 +#endif
20738 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
20739 +       case PREEMPTIRQSOFF_LATENCY:
20740 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
20741 +               break;
20742 +#endif
20743 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20744 +       case WAKEUP_LATENCY:
20745 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
20746 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
20747 +               break;
20748 +       case WAKEUP_LATENCY_SHAREDPRIO:
20749 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
20750 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
20751 +               break;
20752 +#endif
20753 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20754 +       case MISSED_TIMER_OFFSETS:
20755 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
20756 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
20757 +               break;
20758 +#endif
20759 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20760 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20761 +       case TIMERANDWAKEUP_LATENCY:
20762 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
20763 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
20764 +               break;
20765 +#endif
20767 +       default:
20768 +               return;
20769 +       }
20771 +       latency += my_hist->offset;
20773 +       if (atomic_read(&my_hist->hist_mode) == 0)
20774 +               return;
20776 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
20777 +               if (latency < 0)
20778 +                       my_hist->below_hist_bound_samples++;
20779 +               else
20780 +                       my_hist->above_hist_bound_samples++;
20781 +       } else
20782 +               my_hist->hist_array[latency]++;
20784 +       if (unlikely(latency > my_hist->max_lat ||
20785 +           my_hist->min_lat == LONG_MAX)) {
20786 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20787 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20788 +               if (latency_type == WAKEUP_LATENCY ||
20789 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
20790 +                   latency_type == MISSED_TIMER_OFFSETS ||
20791 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
20792 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
20793 +                       strncpy(mp->current_comm, current->comm,
20794 +                           sizeof(mp->current_comm));
20795 +                       mp->pid = task_pid_nr(p);
20796 +                       mp->current_pid = task_pid_nr(current);
20797 +                       mp->prio = p->prio;
20798 +                       mp->current_prio = current->prio;
20799 +                       mp->latency = latency;
20800 +                       mp->timeroffset = timeroffset;
20801 +                       mp->timestamp = stop;
20802 +               }
20803 +#endif
20804 +               my_hist->max_lat = latency;
20805 +       }
20806 +       if (unlikely(latency < my_hist->min_lat))
20807 +               my_hist->min_lat = latency;
20808 +       my_hist->total_samples++;
20809 +       my_hist->accumulate_lat += latency;
20812 +static void *l_start(struct seq_file *m, loff_t *pos)
20814 +       loff_t *index_ptr = NULL;
20815 +       loff_t index = *pos;
20816 +       struct hist_data *my_hist = m->private;
20818 +       if (index == 0) {
20819 +               char minstr[32], avgstr[32], maxstr[32];
20821 +               atomic_dec(&my_hist->hist_mode);
20823 +               if (likely(my_hist->total_samples)) {
20824 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
20825 +                           my_hist->total_samples);
20826 +                       snprintf(minstr, sizeof(minstr), "%ld",
20827 +                           my_hist->min_lat - my_hist->offset);
20828 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
20829 +                           avg - my_hist->offset);
20830 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
20831 +                           my_hist->max_lat - my_hist->offset);
20832 +               } else {
20833 +                       strcpy(minstr, "<undef>");
20834 +                       strcpy(avgstr, minstr);
20835 +                       strcpy(maxstr, minstr);
20836 +               }
20838 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
20839 +                          "#Average latency: %s microseconds\n"
20840 +                          "#Maximum latency: %s microseconds\n"
20841 +                          "#Total samples: %llu\n"
20842 +                          "#There are %llu samples lower than %ld"
20843 +                          " microseconds.\n"
20844 +                          "#There are %llu samples greater than or equal"
20845 +                          " to %ld microseconds.\n"
20846 +                          "#usecs\t%16s\n",
20847 +                          minstr, avgstr, maxstr,
20848 +                          my_hist->total_samples,
20849 +                          my_hist->below_hist_bound_samples,
20850 +                          -my_hist->offset,
20851 +                          my_hist->above_hist_bound_samples,
20852 +                          MAX_ENTRY_NUM - my_hist->offset,
20853 +                          "samples");
20854 +       }
20855 +       if (index < MAX_ENTRY_NUM) {
20856 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
20857 +               if (index_ptr)
20858 +                       *index_ptr = index;
20859 +       }
20861 +       return index_ptr;
20864 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
20866 +       loff_t *index_ptr = p;
20867 +       struct hist_data *my_hist = m->private;
20869 +       if (++*pos >= MAX_ENTRY_NUM) {
20870 +               atomic_inc(&my_hist->hist_mode);
20871 +               return NULL;
20872 +       }
20873 +       *index_ptr = *pos;
20874 +       return index_ptr;
20877 +static void l_stop(struct seq_file *m, void *p)
20879 +       kfree(p);
20882 +static int l_show(struct seq_file *m, void *p)
20884 +       int index = *(loff_t *) p;
20885 +       struct hist_data *my_hist = m->private;
20887 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
20888 +           my_hist->hist_array[index]);
20889 +       return 0;
20892 +static const struct seq_operations latency_hist_seq_op = {
20893 +       .start = l_start,
20894 +       .next  = l_next,
20895 +       .stop  = l_stop,
20896 +       .show  = l_show
20899 +static int latency_hist_open(struct inode *inode, struct file *file)
20901 +       int ret;
20903 +       ret = seq_open(file, &latency_hist_seq_op);
20904 +       if (!ret) {
20905 +               struct seq_file *seq = file->private_data;
20906 +               seq->private = inode->i_private;
20907 +       }
20908 +       return ret;
20911 +static const struct file_operations latency_hist_fops = {
20912 +       .open = latency_hist_open,
20913 +       .read = seq_read,
20914 +       .llseek = seq_lseek,
20915 +       .release = seq_release,
20918 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20919 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20920 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
20922 +       mp->comm[0] = mp->current_comm[0] = '\0';
20923 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
20924 +           mp->latency = mp->timeroffset = -1;
20925 +       mp->timestamp = 0;
20927 +#endif
20929 +static void hist_reset(struct hist_data *hist)
20931 +       atomic_dec(&hist->hist_mode);
20933 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
20934 +       hist->below_hist_bound_samples = 0ULL;
20935 +       hist->above_hist_bound_samples = 0ULL;
20936 +       hist->min_lat = LONG_MAX;
20937 +       hist->max_lat = LONG_MIN;
20938 +       hist->total_samples = 0ULL;
20939 +       hist->accumulate_lat = 0LL;
20941 +       atomic_inc(&hist->hist_mode);
20944 +static ssize_t
20945 +latency_hist_reset(struct file *file, const char __user *a,
20946 +                  size_t size, loff_t *off)
20948 +       int cpu;
20949 +       struct hist_data *hist = NULL;
20950 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20951 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20952 +       struct maxlatproc_data *mp = NULL;
20953 +#endif
20954 +       off_t latency_type = (off_t) file->private_data;
20956 +       for_each_online_cpu(cpu) {
20958 +               switch (latency_type) {
20959 +#ifdef CONFIG_PREEMPT_OFF_HIST
20960 +               case PREEMPTOFF_LATENCY:
20961 +                       hist = &per_cpu(preemptoff_hist, cpu);
20962 +                       break;
20963 +#endif
20964 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20965 +               case IRQSOFF_LATENCY:
20966 +                       hist = &per_cpu(irqsoff_hist, cpu);
20967 +                       break;
20968 +#endif
20969 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
20970 +               case PREEMPTIRQSOFF_LATENCY:
20971 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
20972 +                       break;
20973 +#endif
20974 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20975 +               case WAKEUP_LATENCY:
20976 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
20977 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
20978 +                       break;
20979 +               case WAKEUP_LATENCY_SHAREDPRIO:
20980 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
20981 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
20982 +                       break;
20983 +#endif
20984 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20985 +               case MISSED_TIMER_OFFSETS:
20986 +                       hist = &per_cpu(missed_timer_offsets, cpu);
20987 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
20988 +                       break;
20989 +#endif
20990 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20991 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20992 +               case TIMERANDWAKEUP_LATENCY:
20993 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
20994 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
20995 +                       break;
20996 +#endif
20997 +               }
20999 +               hist_reset(hist);
21000 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21001 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21002 +               if (latency_type == WAKEUP_LATENCY ||
21003 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
21004 +                   latency_type == MISSED_TIMER_OFFSETS ||
21005 +                   latency_type == TIMERANDWAKEUP_LATENCY)
21006 +                       clear_maxlatprocdata(mp);
21007 +#endif
21008 +       }
21010 +       return size;
21013 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21014 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21015 +static ssize_t
21016 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21018 +       char buf[64];
21019 +       int r;
21020 +       unsigned long *this_pid = file->private_data;
21022 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
21023 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21026 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
21027 +                     size_t cnt, loff_t *ppos)
21029 +       char buf[64];
21030 +       unsigned long pid;
21031 +       unsigned long *this_pid = file->private_data;
21033 +       if (cnt >= sizeof(buf))
21034 +               return -EINVAL;
21036 +       if (copy_from_user(&buf, ubuf, cnt))
21037 +               return -EFAULT;
21039 +       buf[cnt] = '\0';
21041 +       if (kstrtoul(buf, 10, &pid))
21042 +               return -EINVAL;
21044 +       *this_pid = pid;
21046 +       return cnt;
21048 +#endif
21050 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21051 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21052 +static ssize_t
21053 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21055 +       int r;
21056 +       struct maxlatproc_data *mp = file->private_data;
21057 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
21058 +       unsigned long long t;
21059 +       unsigned long usecs, secs;
21060 +       char *buf;
21062 +       if (mp->pid == -1 || mp->current_pid == -1) {
21063 +               buf = "(none)\n";
21064 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
21065 +                   strlen(buf));
21066 +       }
21068 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
21069 +       if (buf == NULL)
21070 +               return -ENOMEM;
21072 +       t = ns2usecs(mp->timestamp);
21073 +       usecs = do_div(t, USEC_PER_SEC);
21074 +       secs = (unsigned long) t;
21075 +       r = snprintf(buf, strmaxlen,
21076 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
21077 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
21078 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
21079 +           secs, usecs);
21080 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21081 +       kfree(buf);
21082 +       return r;
21084 +#endif
21086 +static ssize_t
21087 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21089 +       char buf[64];
21090 +       struct enable_data *ed = file->private_data;
21091 +       int r;
21093 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
21094 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21097 +static ssize_t
21098 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
21100 +       char buf[64];
21101 +       long enable;
21102 +       struct enable_data *ed = file->private_data;
21104 +       if (cnt >= sizeof(buf))
21105 +               return -EINVAL;
21107 +       if (copy_from_user(&buf, ubuf, cnt))
21108 +               return -EFAULT;
21110 +       buf[cnt] = 0;
21112 +       if (kstrtoul(buf, 10, &enable))
21113 +               return -EINVAL;
21115 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
21116 +               return cnt;
21118 +       if (enable) {
21119 +               int ret;
21121 +               switch (ed->latency_type) {
21122 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21123 +               case PREEMPTIRQSOFF_LATENCY:
21124 +                       ret = register_trace_preemptirqsoff_hist(
21125 +                           probe_preemptirqsoff_hist, NULL);
21126 +                       if (ret) {
21127 +                               pr_info("wakeup trace: Couldn't assign "
21128 +                                   "probe_preemptirqsoff_hist "
21129 +                                   "to trace_preemptirqsoff_hist\n");
21130 +                               return ret;
21131 +                       }
21132 +                       break;
21133 +#endif
21134 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21135 +               case WAKEUP_LATENCY:
21136 +                       ret = register_trace_sched_wakeup(
21137 +                           probe_wakeup_latency_hist_start, NULL);
21138 +                       if (ret) {
21139 +                               pr_info("wakeup trace: Couldn't assign "
21140 +                                   "probe_wakeup_latency_hist_start "
21141 +                                   "to trace_sched_wakeup\n");
21142 +                               return ret;
21143 +                       }
21144 +                       ret = register_trace_sched_wakeup_new(
21145 +                           probe_wakeup_latency_hist_start, NULL);
21146 +                       if (ret) {
21147 +                               pr_info("wakeup trace: Couldn't assign "
21148 +                                   "probe_wakeup_latency_hist_start "
21149 +                                   "to trace_sched_wakeup_new\n");
21150 +                               unregister_trace_sched_wakeup(
21151 +                                   probe_wakeup_latency_hist_start, NULL);
21152 +                               return ret;
21153 +                       }
21154 +                       ret = register_trace_sched_switch(
21155 +                           probe_wakeup_latency_hist_stop, NULL);
21156 +                       if (ret) {
21157 +                               pr_info("wakeup trace: Couldn't assign "
21158 +                                   "probe_wakeup_latency_hist_stop "
21159 +                                   "to trace_sched_switch\n");
21160 +                               unregister_trace_sched_wakeup(
21161 +                                   probe_wakeup_latency_hist_start, NULL);
21162 +                               unregister_trace_sched_wakeup_new(
21163 +                                   probe_wakeup_latency_hist_start, NULL);
21164 +                               return ret;
21165 +                       }
21166 +                       ret = register_trace_sched_migrate_task(
21167 +                           probe_sched_migrate_task, NULL);
21168 +                       if (ret) {
21169 +                               pr_info("wakeup trace: Couldn't assign "
21170 +                                   "probe_sched_migrate_task "
21171 +                                   "to trace_sched_migrate_task\n");
21172 +                               unregister_trace_sched_wakeup(
21173 +                                   probe_wakeup_latency_hist_start, NULL);
21174 +                               unregister_trace_sched_wakeup_new(
21175 +                                   probe_wakeup_latency_hist_start, NULL);
21176 +                               unregister_trace_sched_switch(
21177 +                                   probe_wakeup_latency_hist_stop, NULL);
21178 +                               return ret;
21179 +                       }
21180 +                       break;
21181 +#endif
21182 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21183 +               case MISSED_TIMER_OFFSETS:
21184 +                       ret = register_trace_hrtimer_interrupt(
21185 +                           probe_hrtimer_interrupt, NULL);
21186 +                       if (ret) {
21187 +                               pr_info("wakeup trace: Couldn't assign "
21188 +                                   "probe_hrtimer_interrupt "
21189 +                                   "to trace_hrtimer_interrupt\n");
21190 +                               return ret;
21191 +                       }
21192 +                       break;
21193 +#endif
21194 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21195 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21196 +               case TIMERANDWAKEUP_LATENCY:
21197 +                       if (!wakeup_latency_enabled_data.enabled ||
21198 +                           !missed_timer_offsets_enabled_data.enabled)
21199 +                               return -EINVAL;
21200 +                       break;
21201 +#endif
21202 +               default:
21203 +                       break;
21204 +               }
21205 +       } else {
21206 +               switch (ed->latency_type) {
21207 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21208 +               case PREEMPTIRQSOFF_LATENCY:
21209 +                       {
21210 +                               int cpu;
21212 +                               unregister_trace_preemptirqsoff_hist(
21213 +                                   probe_preemptirqsoff_hist, NULL);
21214 +                               for_each_online_cpu(cpu) {
21215 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21216 +                                       per_cpu(hist_irqsoff_counting,
21217 +                                           cpu) = 0;
21218 +#endif
21219 +#ifdef CONFIG_PREEMPT_OFF_HIST
21220 +                                       per_cpu(hist_preemptoff_counting,
21221 +                                           cpu) = 0;
21222 +#endif
21223 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21224 +                                       per_cpu(hist_preemptirqsoff_counting,
21225 +                                           cpu) = 0;
21226 +#endif
21227 +                               }
21228 +                       }
21229 +                       break;
21230 +#endif
21231 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21232 +               case WAKEUP_LATENCY:
21233 +                       {
21234 +                               int cpu;
21236 +                               unregister_trace_sched_wakeup(
21237 +                                   probe_wakeup_latency_hist_start, NULL);
21238 +                               unregister_trace_sched_wakeup_new(
21239 +                                   probe_wakeup_latency_hist_start, NULL);
21240 +                               unregister_trace_sched_switch(
21241 +                                   probe_wakeup_latency_hist_stop, NULL);
21242 +                               unregister_trace_sched_migrate_task(
21243 +                                   probe_sched_migrate_task, NULL);
21245 +                               for_each_online_cpu(cpu) {
21246 +                                       per_cpu(wakeup_task, cpu) = NULL;
21247 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
21248 +                               }
21249 +                       }
21250 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21251 +                       timerandwakeup_enabled_data.enabled = 0;
21252 +#endif
21253 +                       break;
21254 +#endif
21255 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21256 +               case MISSED_TIMER_OFFSETS:
21257 +                       unregister_trace_hrtimer_interrupt(
21258 +                           probe_hrtimer_interrupt, NULL);
21259 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21260 +                       timerandwakeup_enabled_data.enabled = 0;
21261 +#endif
21262 +                       break;
21263 +#endif
21264 +               default:
21265 +                       break;
21266 +               }
21267 +       }
21268 +       ed->enabled = enable;
21269 +       return cnt;
21272 +static const struct file_operations latency_hist_reset_fops = {
21273 +       .open = tracing_open_generic,
21274 +       .write = latency_hist_reset,
21277 +static const struct file_operations enable_fops = {
21278 +       .open = tracing_open_generic,
21279 +       .read = show_enable,
21280 +       .write = do_enable,
21283 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21284 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21285 +static const struct file_operations pid_fops = {
21286 +       .open = tracing_open_generic,
21287 +       .read = show_pid,
21288 +       .write = do_pid,
21291 +static const struct file_operations maxlatproc_fops = {
21292 +       .open = tracing_open_generic,
21293 +       .read = show_maxlatproc,
21295 +#endif
21297 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21298 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
21299 +       int starthist)
21301 +       int cpu = raw_smp_processor_id();
21302 +       int time_set = 0;
21304 +       if (starthist) {
21305 +               cycle_t uninitialized_var(start);
21307 +               if (!preempt_count() && !irqs_disabled())
21308 +                       return;
21310 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21311 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
21312 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
21313 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
21314 +                       start = ftrace_now(cpu);
21315 +                       time_set++;
21316 +                       per_cpu(hist_irqsoff_start, cpu) = start;
21317 +               }
21318 +#endif
21320 +#ifdef CONFIG_PREEMPT_OFF_HIST
21321 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
21322 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
21323 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
21324 +                       if (!(time_set++))
21325 +                               start = ftrace_now(cpu);
21326 +                       per_cpu(hist_preemptoff_start, cpu) = start;
21327 +               }
21328 +#endif
21330 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21331 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
21332 +                   per_cpu(hist_preemptoff_counting, cpu) &&
21333 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
21334 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
21335 +                       if (!time_set)
21336 +                               start = ftrace_now(cpu);
21337 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
21338 +               }
21339 +#endif
21340 +       } else {
21341 +               cycle_t uninitialized_var(stop);
21343 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21344 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
21345 +                   per_cpu(hist_irqsoff_counting, cpu)) {
21346 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
21348 +                       stop = ftrace_now(cpu);
21349 +                       time_set++;
21350 +                       if (start) {
21351 +                               long latency = ((long) (stop - start)) /
21352 +                                   NSECS_PER_USECS;
21354 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
21355 +                                   stop, NULL);
21356 +                       }
21357 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
21358 +               }
21359 +#endif
21361 +#ifdef CONFIG_PREEMPT_OFF_HIST
21362 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
21363 +                   per_cpu(hist_preemptoff_counting, cpu)) {
21364 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
21366 +                       if (!(time_set++))
21367 +                               stop = ftrace_now(cpu);
21368 +                       if (start) {
21369 +                               long latency = ((long) (stop - start)) /
21370 +                                   NSECS_PER_USECS;
21372 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
21373 +                                   0, stop, NULL);
21374 +                       }
21375 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
21376 +               }
21377 +#endif
21379 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21380 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
21381 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
21382 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
21383 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
21385 +                       if (!time_set)
21386 +                               stop = ftrace_now(cpu);
21387 +                       if (start) {
21388 +                               long latency = ((long) (stop - start)) /
21389 +                                   NSECS_PER_USECS;
21391 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
21392 +                                   latency, 0, stop, NULL);
21393 +                       }
21394 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
21395 +               }
21396 +#endif
21397 +       }
21399 +#endif
21401 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21402 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
21403 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
21404 +       int cpu)
21406 +       int old_cpu = task_cpu(task);
21408 +       if (cpu != old_cpu) {
21409 +               unsigned long flags;
21410 +               struct task_struct *cpu_wakeup_task;
21412 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
21414 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
21415 +               if (task == cpu_wakeup_task) {
21416 +                       put_task_struct(cpu_wakeup_task);
21417 +                       per_cpu(wakeup_task, old_cpu) = NULL;
21418 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
21419 +                       get_task_struct(cpu_wakeup_task);
21420 +               }
21422 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21423 +       }
21426 +static notrace void probe_wakeup_latency_hist_start(void *v,
21427 +       struct task_struct *p)
21429 +       unsigned long flags;
21430 +       struct task_struct *curr = current;
21431 +       int cpu = task_cpu(p);
21432 +       struct task_struct *cpu_wakeup_task;
21434 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21436 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21438 +       if (wakeup_pid) {
21439 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21440 +                   p->prio == curr->prio)
21441 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21442 +               if (likely(wakeup_pid != task_pid_nr(p)))
21443 +                       goto out;
21444 +       } else {
21445 +               if (likely(!rt_task(p)) ||
21446 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
21447 +                   p->prio > curr->prio)
21448 +                       goto out;
21449 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21450 +                   p->prio == curr->prio)
21451 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21452 +       }
21454 +       if (cpu_wakeup_task)
21455 +               put_task_struct(cpu_wakeup_task);
21456 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
21457 +       get_task_struct(cpu_wakeup_task);
21458 +       cpu_wakeup_task->preempt_timestamp_hist =
21459 +               ftrace_now(raw_smp_processor_id());
21460 +out:
21461 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21464 +static notrace void probe_wakeup_latency_hist_stop(void *v,
21465 +       bool preempt, struct task_struct *prev, struct task_struct *next)
21467 +       unsigned long flags;
21468 +       int cpu = task_cpu(next);
21469 +       long latency;
21470 +       cycle_t stop;
21471 +       struct task_struct *cpu_wakeup_task;
21473 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21475 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21477 +       if (cpu_wakeup_task == NULL)
21478 +               goto out;
21480 +       /* Already running? */
21481 +       if (unlikely(current == cpu_wakeup_task))
21482 +               goto out_reset;
21484 +       if (next != cpu_wakeup_task) {
21485 +               if (next->prio < cpu_wakeup_task->prio)
21486 +                       goto out_reset;
21488 +               if (next->prio == cpu_wakeup_task->prio)
21489 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21491 +               goto out;
21492 +       }
21494 +       if (current->prio == cpu_wakeup_task->prio)
21495 +               per_cpu(wakeup_sharedprio, cpu) = 1;
21497 +       /*
21498 +        * The task we are waiting for is about to be switched to.
21499 +        * Calculate latency and store it in histogram.
21500 +        */
21501 +       stop = ftrace_now(raw_smp_processor_id());
21503 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
21504 +           NSECS_PER_USECS;
21506 +       if (per_cpu(wakeup_sharedprio, cpu)) {
21507 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
21508 +                   next);
21509 +               per_cpu(wakeup_sharedprio, cpu) = 0;
21510 +       } else {
21511 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
21512 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21513 +               if (timerandwakeup_enabled_data.enabled) {
21514 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
21515 +                           next->timer_offset + latency, next->timer_offset,
21516 +                           stop, next);
21517 +               }
21518 +#endif
21519 +       }
21521 +out_reset:
21522 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21523 +       next->timer_offset = 0;
21524 +#endif
21525 +       put_task_struct(cpu_wakeup_task);
21526 +       per_cpu(wakeup_task, cpu) = NULL;
21527 +out:
21528 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21530 +#endif
21532 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21533 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
21534 +       long long latency_ns, struct task_struct *curr,
21535 +       struct task_struct *task)
21537 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
21538 +           (task->prio < curr->prio ||
21539 +           (task->prio == curr->prio &&
21540 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
21541 +               long latency;
21542 +               cycle_t now;
21544 +               if (missed_timer_offsets_pid) {
21545 +                       if (likely(missed_timer_offsets_pid !=
21546 +                           task_pid_nr(task)))
21547 +                               return;
21548 +               }
21550 +               now = ftrace_now(cpu);
21551 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
21552 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
21553 +                   task);
21554 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21555 +               task->timer_offset = latency;
21556 +#endif
21557 +       }
21559 +#endif
21561 +static __init int latency_hist_init(void)
21563 +       struct dentry *latency_hist_root = NULL;
21564 +       struct dentry *dentry;
21565 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21566 +       struct dentry *dentry_sharedprio;
21567 +#endif
21568 +       struct dentry *entry;
21569 +       struct dentry *enable_root;
21570 +       int i = 0;
21571 +       struct hist_data *my_hist;
21572 +       char name[64];
21573 +       char *cpufmt = "CPU%d";
21574 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21575 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21576 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
21577 +       struct maxlatproc_data *mp = NULL;
21578 +#endif
21580 +       dentry = tracing_init_dentry();
21581 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
21582 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
21584 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21585 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
21586 +       for_each_possible_cpu(i) {
21587 +               sprintf(name, cpufmt, i);
21588 +               entry = debugfs_create_file(name, 0444, dentry,
21589 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
21590 +               my_hist = &per_cpu(irqsoff_hist, i);
21591 +               atomic_set(&my_hist->hist_mode, 1);
21592 +               my_hist->min_lat = LONG_MAX;
21593 +       }
21594 +       entry = debugfs_create_file("reset", 0644, dentry,
21595 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
21596 +#endif
21598 +#ifdef CONFIG_PREEMPT_OFF_HIST
21599 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
21600 +           latency_hist_root);
21601 +       for_each_possible_cpu(i) {
21602 +               sprintf(name, cpufmt, i);
21603 +               entry = debugfs_create_file(name, 0444, dentry,
21604 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
21605 +               my_hist = &per_cpu(preemptoff_hist, i);
21606 +               atomic_set(&my_hist->hist_mode, 1);
21607 +               my_hist->min_lat = LONG_MAX;
21608 +       }
21609 +       entry = debugfs_create_file("reset", 0644, dentry,
21610 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
21611 +#endif
21613 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21614 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
21615 +           latency_hist_root);
21616 +       for_each_possible_cpu(i) {
21617 +               sprintf(name, cpufmt, i);
21618 +               entry = debugfs_create_file(name, 0444, dentry,
21619 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
21620 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
21621 +               atomic_set(&my_hist->hist_mode, 1);
21622 +               my_hist->min_lat = LONG_MAX;
21623 +       }
21624 +       entry = debugfs_create_file("reset", 0644, dentry,
21625 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
21626 +#endif
21628 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21629 +       entry = debugfs_create_file("preemptirqsoff", 0644,
21630 +           enable_root, (void *)&preemptirqsoff_enabled_data,
21631 +           &enable_fops);
21632 +#endif
21634 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21635 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
21636 +           latency_hist_root);
21637 +       dentry_sharedprio = debugfs_create_dir(
21638 +           wakeup_latency_hist_dir_sharedprio, dentry);
21639 +       for_each_possible_cpu(i) {
21640 +               sprintf(name, cpufmt, i);
21642 +               entry = debugfs_create_file(name, 0444, dentry,
21643 +                   &per_cpu(wakeup_latency_hist, i),
21644 +                   &latency_hist_fops);
21645 +               my_hist = &per_cpu(wakeup_latency_hist, i);
21646 +               atomic_set(&my_hist->hist_mode, 1);
21647 +               my_hist->min_lat = LONG_MAX;
21649 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
21650 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
21651 +                   &latency_hist_fops);
21652 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
21653 +               atomic_set(&my_hist->hist_mode, 1);
21654 +               my_hist->min_lat = LONG_MAX;
21656 +               sprintf(name, cpufmt_maxlatproc, i);
21658 +               mp = &per_cpu(wakeup_maxlatproc, i);
21659 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21660 +                   &maxlatproc_fops);
21661 +               clear_maxlatprocdata(mp);
21663 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
21664 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
21665 +                   &maxlatproc_fops);
21666 +               clear_maxlatprocdata(mp);
21667 +       }
21668 +       entry = debugfs_create_file("pid", 0644, dentry,
21669 +           (void *)&wakeup_pid, &pid_fops);
21670 +       entry = debugfs_create_file("reset", 0644, dentry,
21671 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
21672 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
21673 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
21674 +       entry = debugfs_create_file("wakeup", 0644,
21675 +           enable_root, (void *)&wakeup_latency_enabled_data,
21676 +           &enable_fops);
21677 +#endif
21679 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21680 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
21681 +           latency_hist_root);
21682 +       for_each_possible_cpu(i) {
21683 +               sprintf(name, cpufmt, i);
21684 +               entry = debugfs_create_file(name, 0444, dentry,
21685 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
21686 +               my_hist = &per_cpu(missed_timer_offsets, i);
21687 +               atomic_set(&my_hist->hist_mode, 1);
21688 +               my_hist->min_lat = LONG_MAX;
21690 +               sprintf(name, cpufmt_maxlatproc, i);
21691 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
21692 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21693 +                   &maxlatproc_fops);
21694 +               clear_maxlatprocdata(mp);
21695 +       }
21696 +       entry = debugfs_create_file("pid", 0644, dentry,
21697 +           (void *)&missed_timer_offsets_pid, &pid_fops);
21698 +       entry = debugfs_create_file("reset", 0644, dentry,
21699 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
21700 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
21701 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
21702 +           &enable_fops);
21703 +#endif
21705 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21706 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21707 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
21708 +           latency_hist_root);
21709 +       for_each_possible_cpu(i) {
21710 +               sprintf(name, cpufmt, i);
21711 +               entry = debugfs_create_file(name, 0444, dentry,
21712 +                   &per_cpu(timerandwakeup_latency_hist, i),
21713 +                   &latency_hist_fops);
21714 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
21715 +               atomic_set(&my_hist->hist_mode, 1);
21716 +               my_hist->min_lat = LONG_MAX;
21718 +               sprintf(name, cpufmt_maxlatproc, i);
21719 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
21720 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21721 +                   &maxlatproc_fops);
21722 +               clear_maxlatprocdata(mp);
21723 +       }
21724 +       entry = debugfs_create_file("reset", 0644, dentry,
21725 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
21726 +       entry = debugfs_create_file("timerandwakeup", 0644,
21727 +           enable_root, (void *)&timerandwakeup_enabled_data,
21728 +           &enable_fops);
21729 +#endif
21730 +       return 0;
21733 +device_initcall(latency_hist_init);
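
The histogram code above only creates the debugfs hierarchy; nothing is recorded until the corresponding file under enable/ is switched on. Below is a minimal user-space sketch (not part of the patch) that turns on the wakeup histogram and dumps the first few per-CPU files. The /sys/kernel/debug/latency_hist/enable/wakeup and /sys/kernel/debug/latency_hist/wakeup/CPU%d paths follow the layout described in Documentation/trace/histograms.txt added earlier in this patch and are an assumption here; adjust them if debugfs is mounted elsewhere or the directory names differ in your tree.

/*
 * Minimal user-space sketch (not part of the patch): enable the wakeup
 * latency histogram and dump a few per-CPU histogram files.  Paths are
 * assumed from the layout documented in histograms.txt added by this
 * patch; adjust DEBUGFS if debugfs is mounted elsewhere.
 */
#include <stdio.h>

#define DEBUGFS "/sys/kernel/debug"

static void dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("--- %s ---\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	FILE *enable = fopen(DEBUGFS "/latency_hist/enable/wakeup", "w");
	char path[256];
	int cpu;

	if (!enable) {
		perror("enable/wakeup (is CONFIG_WAKEUP_LATENCY_HIST set?)");
		return 1;
	}
	fputs("1\n", enable);           /* switch the histogram on */
	fclose(enable);

	for (cpu = 0; cpu < 4; cpu++) { /* first few CPUs as an example */
		snprintf(path, sizeof(path),
			 DEBUGFS "/latency_hist/wakeup/CPU%d", cpu);
		dump_file(path);
	}
	return 0;
}

Writing 0 to the same enable file stops accounting again, and the per-directory reset file (wired to latency_hist_reset_fops above) is there to clear the accumulated counts.
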
21734 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace.c
21735 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace.c    2017-04-16 10:38:30.000000000 +0200
21736 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace.c 2017-04-18 17:54:26.000000000 +0200
21737 @@ -1897,6 +1897,7 @@
21738         struct task_struct *tsk = current;
21740         entry->preempt_count            = pc & 0xff;
21741 +       entry->preempt_lazy_count       = preempt_lazy_count();
21742         entry->pid                      = (tsk) ? tsk->pid : 0;
21743         entry->flags =
21744  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
21745 @@ -1907,8 +1908,11 @@
21746                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
21747                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
21748                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
21749 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
21750 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
21751 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
21752                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
21754 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
21756  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
21758 @@ -2892,14 +2896,17 @@
21760  static void print_lat_help_header(struct seq_file *m)
21762 -       seq_puts(m, "#                  _------=> CPU#            \n"
21763 -                   "#                 / _-----=> irqs-off        \n"
21764 -                   "#                | / _----=> need-resched    \n"
21765 -                   "#                || / _---=> hardirq/softirq \n"
21766 -                   "#                ||| / _--=> preempt-depth   \n"
21767 -                   "#                |||| /     delay            \n"
21768 -                   "#  cmd     pid   ||||| time  |   caller      \n"
21769 -                   "#     \\   /      |||||  \\    |   /         \n");
21770 +       seq_puts(m, "#                  _--------=> CPU#              \n"
21771 +                   "#                 / _-------=> irqs-off          \n"
21772 +                   "#                | / _------=> need-resched      \n"
21773 +                   "#                || / _-----=> need-resched_lazy \n"
21774 +                   "#                ||| / _----=> hardirq/softirq   \n"
21775 +                   "#                |||| / _---=> preempt-depth     \n"
21776 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
21777 +                   "#                |||||| / _-=> migrate-disable   \n"
21778 +                   "#                ||||||| /     delay             \n"
21779 +                   "# cmd     pid    |||||||| time   |  caller       \n"
21780 +                   "#     \\   /      ||||||||   \\    |  /            \n");
21783  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
21784 @@ -2925,11 +2932,14 @@
21785         print_event_info(buf, m);
21786         seq_puts(m, "#                              _-----=> irqs-off\n"
21787                     "#                             / _----=> need-resched\n"
21788 -                   "#                            | / _---=> hardirq/softirq\n"
21789 -                   "#                            || / _--=> preempt-depth\n"
21790 -                   "#                            ||| /     delay\n"
21791 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
21792 -                   "#              | |       |   ||||       |         |\n");
21793 +                   "#                            |/  _-----=> need-resched_lazy\n"
21794 +                   "#                            || / _---=> hardirq/softirq\n"
21795 +                   "#                            ||| / _--=> preempt-depth\n"
21796 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
21797 +                   "#                            ||||| / _-=> migrate-disable   \n"
21798 +                   "#                            |||||| /    delay\n"
21799 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
21800 +                   "#              | |       |   |||||||      |         |\n");
21803  void
21804 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace.h
21805 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace.h    2017-04-16 10:38:30.000000000 +0200
21806 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace.h 2017-04-18 17:54:26.000000000 +0200
21807 @@ -124,6 +124,7 @@
21808   *  NEED_RESCHED       - reschedule is requested
21809   *  HARDIRQ            - inside an interrupt handler
21810   *  SOFTIRQ            - inside a softirq handler
21811 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
21812   */
21813  enum trace_flag_type {
21814         TRACE_FLAG_IRQS_OFF             = 0x01,
21815 @@ -133,6 +134,7 @@
21816         TRACE_FLAG_SOFTIRQ              = 0x10,
21817         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
21818         TRACE_FLAG_NMI                  = 0x40,
21819 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
21820  };
21822  #define TRACE_BUF_SIZE         1024
21823 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_events.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_events.c
21824 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_events.c     2017-04-16 10:38:30.000000000 +0200
21825 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_events.c  2017-04-18 17:54:26.000000000 +0200
21826 @@ -187,6 +187,8 @@
21827         __common_field(unsigned char, flags);
21828         __common_field(unsigned char, preempt_count);
21829         __common_field(int, pid);
21830 +       __common_field(unsigned short, migrate_disable);
21831 +       __common_field(unsigned short, padding);
21833         return ret;
21835 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_irqsoff.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_irqsoff.c
21836 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_irqsoff.c    2017-04-16 10:38:30.000000000 +0200
21837 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_irqsoff.c 2017-04-18 17:54:26.000000000 +0200
21838 @@ -13,6 +13,7 @@
21839  #include <linux/uaccess.h>
21840  #include <linux/module.h>
21841  #include <linux/ftrace.h>
21842 +#include <trace/events/hist.h>
21844  #include "trace.h"
21846 @@ -424,11 +425,13 @@
21848         if (preempt_trace() || irq_trace())
21849                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21850 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
21852  EXPORT_SYMBOL_GPL(start_critical_timings);
21854  void stop_critical_timings(void)
21856 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
21857         if (preempt_trace() || irq_trace())
21858                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21860 @@ -438,6 +441,7 @@
21861  #ifdef CONFIG_PROVE_LOCKING
21862  void time_hardirqs_on(unsigned long a0, unsigned long a1)
21864 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
21865         if (!preempt_trace() && irq_trace())
21866                 stop_critical_timing(a0, a1);
21868 @@ -446,6 +450,7 @@
21870         if (!preempt_trace() && irq_trace())
21871                 start_critical_timing(a0, a1);
21872 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
21875  #else /* !CONFIG_PROVE_LOCKING */
21876 @@ -471,6 +476,7 @@
21877   */
21878  void trace_hardirqs_on(void)
21880 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
21881         if (!preempt_trace() && irq_trace())
21882                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21884 @@ -480,11 +486,13 @@
21886         if (!preempt_trace() && irq_trace())
21887                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21888 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
21890  EXPORT_SYMBOL(trace_hardirqs_off);
21892  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
21894 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
21895         if (!preempt_trace() && irq_trace())
21896                 stop_critical_timing(CALLER_ADDR0, caller_addr);
21898 @@ -494,6 +502,7 @@
21900         if (!preempt_trace() && irq_trace())
21901                 start_critical_timing(CALLER_ADDR0, caller_addr);
21902 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
21904  EXPORT_SYMBOL(trace_hardirqs_off_caller);
21906 @@ -503,12 +512,14 @@
21907  #ifdef CONFIG_PREEMPT_TRACER
21908  void trace_preempt_on(unsigned long a0, unsigned long a1)
21910 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
21911         if (preempt_trace() && !irq_trace())
21912                 stop_critical_timing(a0, a1);
21915  void trace_preempt_off(unsigned long a0, unsigned long a1)
21917 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
21918         if (preempt_trace() && !irq_trace())
21919                 start_critical_timing(a0, a1);
21921 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_output.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_output.c
21922 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/trace/trace_output.c     2017-04-16 10:38:30.000000000 +0200
21923 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/trace/trace_output.c  2017-04-18 17:54:26.000000000 +0200
21924 @@ -386,6 +386,7 @@
21926         char hardsoft_irq;
21927         char need_resched;
21928 +       char need_resched_lazy;
21929         char irqs_off;
21930         int hardirq;
21931         int softirq;
21932 @@ -416,6 +417,9 @@
21933                 break;
21934         }
21936 +       need_resched_lazy =
21937 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
21939         hardsoft_irq =
21940                 (nmi && hardirq)     ? 'Z' :
21941                 nmi                  ? 'z' :
21942 @@ -424,14 +428,25 @@
21943                 softirq              ? 's' :
21944                                        '.' ;
21946 -       trace_seq_printf(s, "%c%c%c",
21947 -                        irqs_off, need_resched, hardsoft_irq);
21948 +       trace_seq_printf(s, "%c%c%c%c",
21949 +                        irqs_off, need_resched, need_resched_lazy,
21950 +                        hardsoft_irq);
21952         if (entry->preempt_count)
21953                 trace_seq_printf(s, "%x", entry->preempt_count);
21954         else
21955                 trace_seq_putc(s, '.');
21957 +       if (entry->preempt_lazy_count)
21958 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
21959 +       else
21960 +               trace_seq_putc(s, '.');
21962 +       if (entry->migrate_disable)
21963 +               trace_seq_printf(s, "%x", entry->migrate_disable);
21964 +       else
21965 +               trace_seq_putc(s, '.');
21967         return !trace_seq_has_overflowed(s);
21970 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/user.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/user.c
21971 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/user.c   2017-04-16 10:38:30.000000000 +0200
21972 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/user.c        2017-04-18 17:54:26.000000000 +0200
21973 @@ -161,11 +161,11 @@
21974         if (!up)
21975                 return;
21977 -       local_irq_save(flags);
21978 +       local_irq_save_nort(flags);
21979         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
21980                 free_user(up, flags);
21981         else
21982 -               local_irq_restore(flags);
21983 +               local_irq_restore_nort(flags);
21986  struct user_struct *alloc_uid(kuid_t uid)
21987 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/watchdog.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/watchdog.c
21988 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/watchdog.c       2017-04-16 10:38:30.000000000 +0200
21989 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/watchdog.c    2017-04-18 17:54:26.000000000 +0200
21990 @@ -315,6 +315,8 @@
21992  #ifdef CONFIG_HARDLOCKUP_DETECTOR
21994 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
21996  static struct perf_event_attr wd_hw_attr = {
21997         .type           = PERF_TYPE_HARDWARE,
21998         .config         = PERF_COUNT_HW_CPU_CYCLES,
21999 @@ -348,6 +350,13 @@
22000                 /* only print hardlockups once */
22001                 if (__this_cpu_read(hard_watchdog_warn) == true)
22002                         return;
22003 +               /*
22004 +                * If early-printk is enabled then make sure we do not
22005 +                * lock up in printk() and kill console logging:
22006 +                */
22007 +               printk_kill();
22009 +               raw_spin_lock(&watchdog_output_lock);
22011                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
22012                 print_modules();
22013 @@ -365,6 +374,7 @@
22014                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
22015                         trigger_allbutself_cpu_backtrace();
22017 +               raw_spin_unlock(&watchdog_output_lock);
22018                 if (hardlockup_panic)
22019                         nmi_panic(regs, "Hard LOCKUP");
22021 @@ -512,6 +522,7 @@
22022         /* kick off the timer for the hardlockup detector */
22023         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
22024         hrtimer->function = watchdog_timer_fn;
22025 +       hrtimer->irqsafe = 1;
22027         /* Enable the perf event */
22028         watchdog_nmi_enable(cpu);
22029 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/workqueue.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/workqueue.c
22030 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/workqueue.c      2017-04-16 10:38:30.000000000 +0200
22031 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/workqueue.c   2017-04-18 17:54:26.000000000 +0200
22032 @@ -48,6 +48,8 @@
22033  #include <linux/nodemask.h>
22034  #include <linux/moduleparam.h>
22035  #include <linux/uaccess.h>
22036 +#include <linux/locallock.h>
22037 +#include <linux/delay.h>
22039  #include "workqueue_internal.h"
22041 @@ -121,11 +123,16 @@
22042   *    cpu or grabbing pool->lock is enough for read access.  If
22043   *    POOL_DISASSOCIATED is set, it's identical to L.
22044   *
22045 + *    On RT we need the extra protection via rt_lock_idle_list() for
22046 + *    the list manipulations against read access from
22047 + *    wq_worker_sleeping(). All other places are nicely serialized via
22048 + *    pool->lock.
22049 + *
22050   * A: pool->attach_mutex protected.
22051   *
22052   * PL: wq_pool_mutex protected.
22053   *
22054 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
22055 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
22056   *
22057   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
22058   *
22059 @@ -134,7 +141,7 @@
22060   *
22061   * WQ: wq->mutex protected.
22062   *
22063 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
22064 + * WR: wq->mutex protected for writes.  RCU protected for reads.
22065   *
22066   * MD: wq_mayday_lock protected.
22067   */
22068 @@ -185,7 +192,7 @@
22069         atomic_t                nr_running ____cacheline_aligned_in_smp;
22071         /*
22072 -        * Destruction of pool is sched-RCU protected to allow dereferences
22073 +        * Destruction of pool is RCU protected to allow dereferences
22074          * from get_work_pool().
22075          */
22076         struct rcu_head         rcu;
22077 @@ -214,7 +221,7 @@
22078         /*
22079          * Release of unbound pwq is punted to system_wq.  See put_pwq()
22080          * and pwq_unbound_release_workfn() for details.  pool_workqueue
22081 -        * itself is also sched-RCU protected so that the first pwq can be
22082 +        * itself is also RCU protected so that the first pwq can be
22083          * determined without grabbing wq->mutex.
22084          */
22085         struct work_struct      unbound_release_work;
22086 @@ -348,6 +355,8 @@
22087  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
22088  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
22090 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
22092  static int worker_thread(void *__worker);
22093  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22095 @@ -355,20 +364,20 @@
22096  #include <trace/events/workqueue.h>
22098  #define assert_rcu_or_pool_mutex()                                     \
22099 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22100 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22101                          !lockdep_is_held(&wq_pool_mutex),              \
22102 -                        "sched RCU or wq_pool_mutex should be held")
22103 +                        "RCU or wq_pool_mutex should be held")
22105  #define assert_rcu_or_wq_mutex(wq)                                     \
22106 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22107 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22108                          !lockdep_is_held(&wq->mutex),                  \
22109 -                        "sched RCU or wq->mutex should be held")
22110 +                        "RCU or wq->mutex should be held")
22112  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
22113 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22114 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22115                          !lockdep_is_held(&wq->mutex) &&                \
22116                          !lockdep_is_held(&wq_pool_mutex),              \
22117 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
22118 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
22120  #define for_each_cpu_worker_pool(pool, cpu)                            \
22121         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
22122 @@ -380,7 +389,7 @@
22123   * @pool: iteration cursor
22124   * @pi: integer used for iteration
22125   *
22126 - * This must be called either with wq_pool_mutex held or sched RCU read
22127 + * This must be called either with wq_pool_mutex held or RCU read
22128   * locked.  If the pool needs to be used beyond the locking in effect, the
22129   * caller is responsible for guaranteeing that the pool stays online.
22130   *
22131 @@ -412,7 +421,7 @@
22132   * @pwq: iteration cursor
22133   * @wq: the target workqueue
22134   *
22135 - * This must be called either with wq->mutex held or sched RCU read locked.
22136 + * This must be called either with wq->mutex held or RCU read locked.
22137   * If the pwq needs to be used beyond the locking in effect, the caller is
22138   * responsible for guaranteeing that the pwq stays online.
22139   *
22140 @@ -424,6 +433,31 @@
22141                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
22142                 else
22144 +#ifdef CONFIG_PREEMPT_RT_BASE
22145 +static inline void rt_lock_idle_list(struct worker_pool *pool)
22147 +       preempt_disable();
22149 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
22151 +       preempt_enable();
22153 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
22154 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
22155 +#else
22156 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
22157 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
22158 +static inline void sched_lock_idle_list(struct worker_pool *pool)
22160 +       spin_lock_irq(&pool->lock);
22162 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
22164 +       spin_unlock_irq(&pool->lock);
22166 +#endif
22169  #ifdef CONFIG_DEBUG_OBJECTS_WORK
22171  static struct debug_obj_descr work_debug_descr;
22172 @@ -548,7 +582,7 @@
22173   * @wq: the target workqueue
22174   * @node: the node ID
22175   *
22176 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
22177 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
22178   * read locked.
22179   * If the pwq needs to be used beyond the locking in effect, the caller is
22180   * responsible for guaranteeing that the pwq stays online.
22181 @@ -692,8 +726,8 @@
22182   * @work: the work item of interest
22183   *
22184   * Pools are created and destroyed under wq_pool_mutex, and allows read
22185 - * access under sched-RCU read lock.  As such, this function should be
22186 - * called under wq_pool_mutex or with preemption disabled.
22187 + * access under RCU read lock.  As such, this function should be
22188 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
22189   *
22190   * All fields of the returned pool are accessible as long as the above
22191   * mentioned locking is in effect.  If the returned pool needs to be used
22192 @@ -830,50 +864,45 @@
22193   */
22194  static void wake_up_worker(struct worker_pool *pool)
22196 -       struct worker *worker = first_idle_worker(pool);
22197 +       struct worker *worker;
22199 +       rt_lock_idle_list(pool);
22201 +       worker = first_idle_worker(pool);
22203         if (likely(worker))
22204                 wake_up_process(worker->task);
22206 +       rt_unlock_idle_list(pool);
22209  /**
22210 - * wq_worker_waking_up - a worker is waking up
22211 + * wq_worker_running - a worker is running again
22212   * @task: task waking up
22213 - * @cpu: CPU @task is waking up to
22214   *
22215 - * This function is called during try_to_wake_up() when a worker is
22216 - * being awoken.
22217 - *
22218 - * CONTEXT:
22219 - * spin_lock_irq(rq->lock)
22220 + * This function is called when a worker returns from schedule()
22221   */
22222 -void wq_worker_waking_up(struct task_struct *task, int cpu)
22223 +void wq_worker_running(struct task_struct *task)
22225         struct worker *worker = kthread_data(task);
22227 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
22228 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
22229 +       if (!worker->sleeping)
22230 +               return;
22231 +       if (!(worker->flags & WORKER_NOT_RUNNING))
22232                 atomic_inc(&worker->pool->nr_running);
22233 -       }
22234 +       worker->sleeping = 0;
22237  /**
22238   * wq_worker_sleeping - a worker is going to sleep
22239   * @task: task going to sleep
22240   *
22241 - * This function is called during schedule() when a busy worker is
22242 - * going to sleep.  Worker on the same cpu can be woken up by
22243 - * returning pointer to its task.
22244 - *
22245 - * CONTEXT:
22246 - * spin_lock_irq(rq->lock)
22247 - *
22248 - * Return:
22249 - * Worker task on @cpu to wake up, %NULL if none.
22250 + * This function is called from schedule() when a busy worker is
22251 + * going to sleep.
22252   */
22253 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
22254 +void wq_worker_sleeping(struct task_struct *task)
22256 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
22257 +       struct worker *worker = kthread_data(task);
22258         struct worker_pool *pool;
22260         /*
22261 @@ -882,29 +911,26 @@
22262          * checking NOT_RUNNING.
22263          */
22264         if (worker->flags & WORKER_NOT_RUNNING)
22265 -               return NULL;
22266 +               return;
22268         pool = worker->pool;
22270 -       /* this can only happen on the local cpu */
22271 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
22272 -               return NULL;
22273 +       if (WARN_ON_ONCE(worker->sleeping))
22274 +               return;
22276 +       worker->sleeping = 1;
22278         /*
22279          * The counterpart of the following dec_and_test, implied mb,
22280          * worklist not empty test sequence is in insert_work().
22281          * Please read comment there.
22282 -        *
22283 -        * NOT_RUNNING is clear.  This means that we're bound to and
22284 -        * running on the local cpu w/ rq lock held and preemption
22285 -        * disabled, which in turn means that none else could be
22286 -        * manipulating idle_list, so dereferencing idle_list without pool
22287 -        * lock is safe.
22288          */
22289         if (atomic_dec_and_test(&pool->nr_running) &&
22290 -           !list_empty(&pool->worklist))
22291 -               to_wakeup = first_idle_worker(pool);
22292 -       return to_wakeup ? to_wakeup->task : NULL;
22293 +           !list_empty(&pool->worklist)) {
22294 +               sched_lock_idle_list(pool);
22295 +               wake_up_worker(pool);
22296 +               sched_unlock_idle_list(pool);
22297 +       }
22300  /**
22301 @@ -1098,12 +1124,14 @@
22303         if (pwq) {
22304                 /*
22305 -                * As both pwqs and pools are sched-RCU protected, the
22306 +                * As both pwqs and pools are RCU protected, the
22307                  * following lock operations are safe.
22308                  */
22309 -               spin_lock_irq(&pwq->pool->lock);
22310 +               rcu_read_lock();
22311 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
22312                 put_pwq(pwq);
22313 -               spin_unlock_irq(&pwq->pool->lock);
22314 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
22315 +               rcu_read_unlock();
22316         }
22319 @@ -1207,7 +1235,7 @@
22320         struct worker_pool *pool;
22321         struct pool_workqueue *pwq;
22323 -       local_irq_save(*flags);
22324 +       local_lock_irqsave(pendingb_lock, *flags);
22326         /* try to steal the timer if it exists */
22327         if (is_dwork) {
22328 @@ -1226,6 +1254,7 @@
22329         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
22330                 return 0;
22332 +       rcu_read_lock();
22333         /*
22334          * The queueing is in progress, or it is already queued. Try to
22335          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
22336 @@ -1264,14 +1293,16 @@
22337                 set_work_pool_and_keep_pending(work, pool->id);
22339                 spin_unlock(&pool->lock);
22340 +               rcu_read_unlock();
22341                 return 1;
22342         }
22343         spin_unlock(&pool->lock);
22344  fail:
22345 -       local_irq_restore(*flags);
22346 +       rcu_read_unlock();
22347 +       local_unlock_irqrestore(pendingb_lock, *flags);
22348         if (work_is_canceling(work))
22349                 return -ENOENT;
22350 -       cpu_relax();
22351 +       cpu_chill();
22352         return -EAGAIN;
22355 @@ -1373,7 +1404,7 @@
22356          * queued or lose PENDING.  Grabbing PENDING and queueing should
22357          * happen with IRQ disabled.
22358          */
22359 -       WARN_ON_ONCE(!irqs_disabled());
22360 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
22362         debug_work_activate(work);
22364 @@ -1381,6 +1412,7 @@
22365         if (unlikely(wq->flags & __WQ_DRAINING) &&
22366             WARN_ON_ONCE(!is_chained_work(wq)))
22367                 return;
22368 +       rcu_read_lock();
22369  retry:
22370         if (req_cpu == WORK_CPU_UNBOUND)
22371                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
22372 @@ -1437,10 +1469,8 @@
22373         /* pwq determined, queue */
22374         trace_workqueue_queue_work(req_cpu, pwq, work);
22376 -       if (WARN_ON(!list_empty(&work->entry))) {
22377 -               spin_unlock(&pwq->pool->lock);
22378 -               return;
22379 -       }
22380 +       if (WARN_ON(!list_empty(&work->entry)))
22381 +               goto out;
22383         pwq->nr_in_flight[pwq->work_color]++;
22384         work_flags = work_color_to_flags(pwq->work_color);
22385 @@ -1458,7 +1488,9 @@
22387         insert_work(pwq, work, worklist, work_flags);
22389 +out:
22390         spin_unlock(&pwq->pool->lock);
22391 +       rcu_read_unlock();
22394  /**
22395 @@ -1478,14 +1510,14 @@
22396         bool ret = false;
22397         unsigned long flags;
22399 -       local_irq_save(flags);
22400 +       local_lock_irqsave(pendingb_lock,flags);
22402         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22403                 __queue_work(cpu, wq, work);
22404                 ret = true;
22405         }
22407 -       local_irq_restore(flags);
22408 +       local_unlock_irqrestore(pendingb_lock, flags);
22409         return ret;
22411  EXPORT_SYMBOL(queue_work_on);
22412 @@ -1552,14 +1584,14 @@
22413         unsigned long flags;
22415         /* read the comment in __queue_work() */
22416 -       local_irq_save(flags);
22417 +       local_lock_irqsave(pendingb_lock, flags);
22419         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22420                 __queue_delayed_work(cpu, wq, dwork, delay);
22421                 ret = true;
22422         }
22424 -       local_irq_restore(flags);
22425 +       local_unlock_irqrestore(pendingb_lock, flags);
22426         return ret;
22428  EXPORT_SYMBOL(queue_delayed_work_on);
22429 @@ -1594,7 +1626,7 @@
22431         if (likely(ret >= 0)) {
22432                 __queue_delayed_work(cpu, wq, dwork, delay);
22433 -               local_irq_restore(flags);
22434 +               local_unlock_irqrestore(pendingb_lock, flags);
22435         }
22437         /* -ENOENT from try_to_grab_pending() becomes %true */
22438 @@ -1627,7 +1659,9 @@
22439         worker->last_active = jiffies;
22441         /* idle_list is LIFO */
22442 +       rt_lock_idle_list(pool);
22443         list_add(&worker->entry, &pool->idle_list);
22444 +       rt_unlock_idle_list(pool);
22446         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
22447                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
22448 @@ -1660,7 +1694,9 @@
22449                 return;
22450         worker_clr_flags(worker, WORKER_IDLE);
22451         pool->nr_idle--;
22452 +       rt_lock_idle_list(pool);
22453         list_del_init(&worker->entry);
22454 +       rt_unlock_idle_list(pool);
22457  static struct worker *alloc_worker(int node)
22458 @@ -1826,7 +1862,9 @@
22459         pool->nr_workers--;
22460         pool->nr_idle--;
22462 +       rt_lock_idle_list(pool);
22463         list_del_init(&worker->entry);
22464 +       rt_unlock_idle_list(pool);
22465         worker->flags |= WORKER_DIE;
22466         wake_up_process(worker->task);
22468 @@ -2785,14 +2823,14 @@
22470         might_sleep();
22472 -       local_irq_disable();
22473 +       rcu_read_lock();
22474         pool = get_work_pool(work);
22475         if (!pool) {
22476 -               local_irq_enable();
22477 +               rcu_read_unlock();
22478                 return false;
22479         }
22481 -       spin_lock(&pool->lock);
22482 +       spin_lock_irq(&pool->lock);
22483         /* see the comment in try_to_grab_pending() with the same code */
22484         pwq = get_work_pwq(work);
22485         if (pwq) {
22486 @@ -2821,10 +2859,11 @@
22487         else
22488                 lock_map_acquire_read(&pwq->wq->lockdep_map);
22489         lock_map_release(&pwq->wq->lockdep_map);
22491 +       rcu_read_unlock();
22492         return true;
22493  already_gone:
22494         spin_unlock_irq(&pool->lock);
22495 +       rcu_read_unlock();
22496         return false;
22499 @@ -2911,7 +2950,7 @@
22501         /* tell other tasks trying to grab @work to back off */
22502         mark_work_canceling(work);
22503 -       local_irq_restore(flags);
22504 +       local_unlock_irqrestore(pendingb_lock, flags);
22506         flush_work(work);
22507         clear_work_data(work);
22508 @@ -2966,10 +3005,10 @@
22509   */
22510  bool flush_delayed_work(struct delayed_work *dwork)
22512 -       local_irq_disable();
22513 +       local_lock_irq(pendingb_lock);
22514         if (del_timer_sync(&dwork->timer))
22515                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
22516 -       local_irq_enable();
22517 +       local_unlock_irq(pendingb_lock);
22518         return flush_work(&dwork->work);
22520  EXPORT_SYMBOL(flush_delayed_work);
22521 @@ -2987,7 +3026,7 @@
22522                 return false;
22524         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
22525 -       local_irq_restore(flags);
22526 +       local_unlock_irqrestore(pendingb_lock, flags);
22527         return ret;
22530 @@ -3245,7 +3284,7 @@
22531   * put_unbound_pool - put a worker_pool
22532   * @pool: worker_pool to put
22533   *
22534 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
22535 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
22536   * safe manner.  get_unbound_pool() calls this function on its failure path
22537   * and this function should be able to release pools which went through,
22538   * successfully or not, init_worker_pool().
22539 @@ -3299,8 +3338,8 @@
22540         del_timer_sync(&pool->idle_timer);
22541         del_timer_sync(&pool->mayday_timer);
22543 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
22544 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
22545 +       /* RCU protected to allow dereferences from get_work_pool() */
22546 +       call_rcu(&pool->rcu, rcu_free_pool);
22549  /**
22550 @@ -3407,14 +3446,14 @@
22551         put_unbound_pool(pool);
22552         mutex_unlock(&wq_pool_mutex);
22554 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
22555 +       call_rcu(&pwq->rcu, rcu_free_pwq);
22557         /*
22558          * If we're the last pwq going away, @wq is already dead and no one
22559          * is gonna access it anymore.  Schedule RCU free.
22560          */
22561         if (is_last)
22562 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22563 +               call_rcu(&wq->rcu, rcu_free_wq);
22566  /**
22567 @@ -4064,7 +4103,7 @@
22568                  * The base ref is never dropped on per-cpu pwqs.  Directly
22569                  * schedule RCU free.
22570                  */
22571 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22572 +               call_rcu(&wq->rcu, rcu_free_wq);
22573         } else {
22574                 /*
22575                  * We're the sole accessor of @wq at this point.  Directly
22576 @@ -4157,7 +4196,8 @@
22577         struct pool_workqueue *pwq;
22578         bool ret;
22580 -       rcu_read_lock_sched();
22581 +       rcu_read_lock();
22582 +       preempt_disable();
22584         if (cpu == WORK_CPU_UNBOUND)
22585                 cpu = smp_processor_id();
22586 @@ -4168,7 +4208,8 @@
22587                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
22589         ret = !list_empty(&pwq->delayed_works);
22590 -       rcu_read_unlock_sched();
22591 +       preempt_enable();
22592 +       rcu_read_unlock();
22594         return ret;
22596 @@ -4194,15 +4235,15 @@
22597         if (work_pending(work))
22598                 ret |= WORK_BUSY_PENDING;
22600 -       local_irq_save(flags);
22601 +       rcu_read_lock();
22602         pool = get_work_pool(work);
22603         if (pool) {
22604 -               spin_lock(&pool->lock);
22605 +               spin_lock_irqsave(&pool->lock, flags);
22606                 if (find_worker_executing_work(pool, work))
22607                         ret |= WORK_BUSY_RUNNING;
22608 -               spin_unlock(&pool->lock);
22609 +               spin_unlock_irqrestore(&pool->lock, flags);
22610         }
22611 -       local_irq_restore(flags);
22612 +       rcu_read_unlock();
22614         return ret;
22616 @@ -4391,7 +4432,7 @@
22617         unsigned long flags;
22618         int pi;
22620 -       rcu_read_lock_sched();
22621 +       rcu_read_lock();
22623         pr_info("Showing busy workqueues and worker pools:\n");
22625 @@ -4444,7 +4485,7 @@
22626                 spin_unlock_irqrestore(&pool->lock, flags);
22627         }
22629 -       rcu_read_unlock_sched();
22630 +       rcu_read_unlock();
22633  /*
22634 @@ -4782,16 +4823,16 @@
22635                  * nr_active is monotonically decreasing.  It's safe
22636                  * to peek without lock.
22637                  */
22638 -               rcu_read_lock_sched();
22639 +               rcu_read_lock();
22640                 for_each_pwq(pwq, wq) {
22641                         WARN_ON_ONCE(pwq->nr_active < 0);
22642                         if (pwq->nr_active) {
22643                                 busy = true;
22644 -                               rcu_read_unlock_sched();
22645 +                               rcu_read_unlock();
22646                                 goto out_unlock;
22647                         }
22648                 }
22649 -               rcu_read_unlock_sched();
22650 +               rcu_read_unlock();
22651         }
22652  out_unlock:
22653         mutex_unlock(&wq_pool_mutex);
22654 @@ -4981,7 +5022,8 @@
22655         const char *delim = "";
22656         int node, written = 0;
22658 -       rcu_read_lock_sched();
22659 +       get_online_cpus();
22660 +       rcu_read_lock();
22661         for_each_node(node) {
22662                 written += scnprintf(buf + written, PAGE_SIZE - written,
22663                                      "%s%d:%d", delim, node,
22664 @@ -4989,7 +5031,8 @@
22665                 delim = " ";
22666         }
22667         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
22668 -       rcu_read_unlock_sched();
22669 +       rcu_read_unlock();
22670 +       put_online_cpus();
22672         return written;
22674 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/workqueue_internal.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/workqueue_internal.h
22675 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/kernel/workqueue_internal.h     2017-04-16 10:38:30.000000000 +0200
22676 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/kernel/workqueue_internal.h  2017-04-18 17:54:26.000000000 +0200
22677 @@ -43,6 +43,7 @@
22678         unsigned long           last_active;    /* L: last active timestamp */
22679         unsigned int            flags;          /* X: flags */
22680         int                     id;             /* I: worker id */
22681 +       int                     sleeping;       /* None */
22683         /*
22684          * Opaque string set with work_set_desc().  Printed out with task
22685 @@ -68,7 +69,7 @@
22686   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
22687   * sched/core.c and workqueue.c.
22688   */
22689 -void wq_worker_waking_up(struct task_struct *task, int cpu);
22690 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
22691 +void wq_worker_running(struct task_struct *task);
22692 +void wq_worker_sleeping(struct task_struct *task);
22694  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
22695 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/Kconfig
22696 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/Kconfig     2017-04-16 10:38:30.000000000 +0200
22697 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/Kconfig  2017-04-18 17:54:26.000000000 +0200
22698 @@ -400,6 +400,7 @@
22700  config CPUMASK_OFFSTACK
22701         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
22702 +       depends on !PREEMPT_RT_FULL
22703         help
22704           Use dynamic allocation for cpumask_var_t, instead of putting
22705           them on the stack.  This is a bit more expensive, but avoids
22706 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/debugobjects.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/debugobjects.c
22707 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/debugobjects.c      2017-04-16 10:38:30.000000000 +0200
22708 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/debugobjects.c   2017-04-18 17:54:26.000000000 +0200
22709 @@ -308,7 +308,10 @@
22710         struct debug_obj *obj;
22711         unsigned long flags;
22713 -       fill_pool();
22714 +#ifdef CONFIG_PREEMPT_RT_FULL
22715 +       if (preempt_count() == 0 && !irqs_disabled())
22716 +#endif
22717 +               fill_pool();
22719         db = get_bucket((unsigned long) addr);
22721 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/idr.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/idr.c
22722 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/idr.c       2017-04-16 10:38:30.000000000 +0200
22723 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/idr.c    2017-04-18 17:54:26.000000000 +0200
22724 @@ -30,6 +30,7 @@
22725  #include <linux/idr.h>
22726  #include <linux/spinlock.h>
22727  #include <linux/percpu.h>
22728 +#include <linux/locallock.h>
22730  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
22731  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
22732 @@ -45,6 +46,37 @@
22733  static DEFINE_PER_CPU(int, idr_preload_cnt);
22734  static DEFINE_SPINLOCK(simple_ida_lock);
22736 +#ifdef CONFIG_PREEMPT_RT_FULL
22737 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
22739 +static inline void idr_preload_lock(void)
22741 +       local_lock(idr_lock);
22744 +static inline void idr_preload_unlock(void)
22746 +       local_unlock(idr_lock);
22749 +void idr_preload_end(void)
22751 +       idr_preload_unlock();
22753 +EXPORT_SYMBOL(idr_preload_end);
22754 +#else
22755 +static inline void idr_preload_lock(void)
22757 +       preempt_disable();
22760 +static inline void idr_preload_unlock(void)
22762 +       preempt_enable();
22764 +#endif
22767  /* the maximum ID which can be allocated given idr->layers */
22768  static int idr_max(int layers)
22770 @@ -115,14 +147,14 @@
22771          * context.  See idr_preload() for details.
22772          */
22773         if (!in_interrupt()) {
22774 -               preempt_disable();
22775 +               idr_preload_lock();
22776                 new = __this_cpu_read(idr_preload_head);
22777                 if (new) {
22778                         __this_cpu_write(idr_preload_head, new->ary[0]);
22779                         __this_cpu_dec(idr_preload_cnt);
22780                         new->ary[0] = NULL;
22781                 }
22782 -               preempt_enable();
22783 +               idr_preload_unlock();
22784                 if (new)
22785                         return new;
22786         }
22787 @@ -366,7 +398,6 @@
22788         idr_mark_full(pa, id);
22792  /**
22793   * idr_preload - preload for idr_alloc()
22794   * @gfp_mask: allocation mask to use for preloading
22795 @@ -401,7 +432,7 @@
22796         WARN_ON_ONCE(in_interrupt());
22797         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
22799 -       preempt_disable();
22800 +       idr_preload_lock();
22802         /*
22803          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
22804 @@ -413,9 +444,9 @@
22805         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
22806                 struct idr_layer *new;
22808 -               preempt_enable();
22809 +               idr_preload_unlock();
22810                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
22811 -               preempt_disable();
22812 +               idr_preload_lock();
22813                 if (!new)
22814                         break;
22816 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/irq_poll.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/irq_poll.c
22817 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/irq_poll.c  2017-04-16 10:38:30.000000000 +0200
22818 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/irq_poll.c       2017-04-18 17:54:26.000000000 +0200
22819 @@ -36,6 +36,7 @@
22820         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
22821         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22822         local_irq_restore(flags);
22823 +       preempt_check_resched_rt();
22825  EXPORT_SYMBOL(irq_poll_sched);
22827 @@ -71,6 +72,7 @@
22828         local_irq_save(flags);
22829         __irq_poll_complete(iop);
22830         local_irq_restore(flags);
22831 +       preempt_check_resched_rt();
22833  EXPORT_SYMBOL(irq_poll_complete);
22835 @@ -95,6 +97,7 @@
22836                 }
22838                 local_irq_enable();
22839 +               preempt_check_resched_rt();
22841                 /* Even though interrupts have been re-enabled, this
22842                  * access is safe because interrupts can only add new
22843 @@ -132,6 +135,7 @@
22844                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22846         local_irq_enable();
22847 +       preempt_check_resched_rt();
22850  /**
22851 @@ -195,6 +199,7 @@
22852                          this_cpu_ptr(&blk_cpu_iopoll));
22853         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22854         local_irq_enable();
22855 +       preempt_check_resched_rt();
22857         return 0;
22859 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/locking-selftest.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/locking-selftest.c
22860 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/locking-selftest.c  2017-04-16 10:38:30.000000000 +0200
22861 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/locking-selftest.c       2017-04-18 17:54:26.000000000 +0200
22862 @@ -590,6 +590,8 @@
22863  #include "locking-selftest-spin-hardirq.h"
22864  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
22866 +#ifndef CONFIG_PREEMPT_RT_FULL
22868  #include "locking-selftest-rlock-hardirq.h"
22869  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
22871 @@ -605,9 +607,12 @@
22872  #include "locking-selftest-wlock-softirq.h"
22873  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
22875 +#endif
22877  #undef E1
22878  #undef E2
22880 +#ifndef CONFIG_PREEMPT_RT_FULL
22881  /*
22882   * Enabling hardirqs with a softirq-safe lock held:
22883   */
22884 @@ -640,6 +645,8 @@
22885  #undef E1
22886  #undef E2
22888 +#endif
22890  /*
22891   * Enabling irqs with an irq-safe lock held:
22892   */
22893 @@ -663,6 +670,8 @@
22894  #include "locking-selftest-spin-hardirq.h"
22895  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
22897 +#ifndef CONFIG_PREEMPT_RT_FULL
22899  #include "locking-selftest-rlock-hardirq.h"
22900  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
22902 @@ -678,6 +687,8 @@
22903  #include "locking-selftest-wlock-softirq.h"
22904  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
22906 +#endif
22908  #undef E1
22909  #undef E2
22911 @@ -709,6 +720,8 @@
22912  #include "locking-selftest-spin-hardirq.h"
22913  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
22915 +#ifndef CONFIG_PREEMPT_RT_FULL
22917  #include "locking-selftest-rlock-hardirq.h"
22918  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
22920 @@ -724,6 +737,8 @@
22921  #include "locking-selftest-wlock-softirq.h"
22922  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
22924 +#endif
22926  #undef E1
22927  #undef E2
22928  #undef E3
22929 @@ -757,6 +772,8 @@
22930  #include "locking-selftest-spin-hardirq.h"
22931  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
22933 +#ifndef CONFIG_PREEMPT_RT_FULL
22935  #include "locking-selftest-rlock-hardirq.h"
22936  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
22938 @@ -772,10 +789,14 @@
22939  #include "locking-selftest-wlock-softirq.h"
22940  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
22942 +#endif
22944  #undef E1
22945  #undef E2
22946  #undef E3
22948 +#ifndef CONFIG_PREEMPT_RT_FULL
22950  /*
22951   * read-lock / write-lock irq inversion.
22952   *
22953 @@ -838,6 +859,10 @@
22954  #undef E2
22955  #undef E3
22957 +#endif
22959 +#ifndef CONFIG_PREEMPT_RT_FULL
22961  /*
22962   * read-lock / write-lock recursion that is actually safe.
22963   */
22964 @@ -876,6 +901,8 @@
22965  #undef E2
22966  #undef E3
22968 +#endif
22970  /*
22971   * read-lock / write-lock recursion that is unsafe.
22972   */
22973 @@ -1858,6 +1885,7 @@
22975         printk("  --------------------------------------------------------------------------\n");
22977 +#ifndef CONFIG_PREEMPT_RT_FULL
22978         /*
22979          * irq-context testcases:
22980          */
22981 @@ -1870,6 +1898,28 @@
22983         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
22984  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
22985 +#else
22986 +       /* On -rt, we only do hardirq context test for raw spinlock */
22987 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
22988 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
22990 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
22991 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
22993 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
22994 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
22995 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
22996 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
22997 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
22998 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
23000 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
23001 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
23002 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
23003 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
23004 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
23005 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
23006 +#endif
23008         ww_tests();
23010 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/percpu_ida.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/percpu_ida.c
23011 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/percpu_ida.c        2017-04-16 10:38:30.000000000 +0200
23012 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/percpu_ida.c     2017-04-18 17:54:26.000000000 +0200
23013 @@ -26,6 +26,9 @@
23014  #include <linux/string.h>
23015  #include <linux/spinlock.h>
23016  #include <linux/percpu_ida.h>
23017 +#include <linux/locallock.h>
23019 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
23021  struct percpu_ida_cpu {
23022         /*
23023 @@ -148,13 +151,13 @@
23024         unsigned long flags;
23025         int tag;
23027 -       local_irq_save(flags);
23028 +       local_lock_irqsave(irq_off_lock, flags);
23029         tags = this_cpu_ptr(pool->tag_cpu);
23031         /* Fastpath */
23032         tag = alloc_local_tag(tags);
23033         if (likely(tag >= 0)) {
23034 -               local_irq_restore(flags);
23035 +               local_unlock_irqrestore(irq_off_lock, flags);
23036                 return tag;
23037         }
23039 @@ -173,6 +176,7 @@
23041                 if (!tags->nr_free)
23042                         alloc_global_tags(pool, tags);
23044                 if (!tags->nr_free)
23045                         steal_tags(pool, tags);
23047 @@ -184,7 +188,7 @@
23048                 }
23050                 spin_unlock(&pool->lock);
23051 -               local_irq_restore(flags);
23052 +               local_unlock_irqrestore(irq_off_lock, flags);
23054                 if (tag >= 0 || state == TASK_RUNNING)
23055                         break;
23056 @@ -196,7 +200,7 @@
23058                 schedule();
23060 -               local_irq_save(flags);
23061 +               local_lock_irqsave(irq_off_lock, flags);
23062                 tags = this_cpu_ptr(pool->tag_cpu);
23063         }
23064         if (state != TASK_RUNNING)
23065 @@ -221,7 +225,7 @@
23067         BUG_ON(tag >= pool->nr_tags);
23069 -       local_irq_save(flags);
23070 +       local_lock_irqsave(irq_off_lock, flags);
23071         tags = this_cpu_ptr(pool->tag_cpu);
23073         spin_lock(&tags->lock);
23074 @@ -253,7 +257,7 @@
23075                 spin_unlock(&pool->lock);
23076         }
23078 -       local_irq_restore(flags);
23079 +       local_unlock_irqrestore(irq_off_lock, flags);
23081  EXPORT_SYMBOL_GPL(percpu_ida_free);
23083 @@ -345,7 +349,7 @@
23084         struct percpu_ida_cpu *remote;
23085         unsigned cpu, i, err = 0;
23087 -       local_irq_save(flags);
23088 +       local_lock_irqsave(irq_off_lock, flags);
23089         for_each_possible_cpu(cpu) {
23090                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
23091                 spin_lock(&remote->lock);
23092 @@ -367,7 +371,7 @@
23093         }
23094         spin_unlock(&pool->lock);
23095  out:
23096 -       local_irq_restore(flags);
23097 +       local_unlock_irqrestore(irq_off_lock, flags);
23098         return err;
23100  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
23101 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/radix-tree.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/radix-tree.c
23102 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/radix-tree.c        2017-04-16 10:38:30.000000000 +0200
23103 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/radix-tree.c     2017-04-18 17:54:26.000000000 +0200
23104 @@ -36,7 +36,7 @@
23105  #include <linux/bitops.h>
23106  #include <linux/rcupdate.h>
23107  #include <linux/preempt.h>             /* in_interrupt() */
23109 +#include <linux/locallock.h>
23111  /* Number of nodes in fully populated tree of given height */
23112  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
23113 @@ -68,6 +68,7 @@
23114         struct radix_tree_node *nodes;
23115  };
23116  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
23117 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
23119  static inline void *node_to_entry(void *ptr)
23121 @@ -290,13 +291,14 @@
23122                  * succeed in getting a node here (and never reach
23123                  * kmem_cache_alloc)
23124                  */
23125 -               rtp = this_cpu_ptr(&radix_tree_preloads);
23126 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
23127                 if (rtp->nr) {
23128                         ret = rtp->nodes;
23129                         rtp->nodes = ret->private_data;
23130                         ret->private_data = NULL;
23131                         rtp->nr--;
23132                 }
23133 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
23134                 /*
23135                  * Update the allocation stack trace as this is more useful
23136                  * for debugging.
23137 @@ -357,14 +359,14 @@
23138          */
23139         gfp_mask &= ~__GFP_ACCOUNT;
23141 -       preempt_disable();
23142 +       local_lock(radix_tree_preloads_lock);
23143         rtp = this_cpu_ptr(&radix_tree_preloads);
23144         while (rtp->nr < nr) {
23145 -               preempt_enable();
23146 +               local_unlock(radix_tree_preloads_lock);
23147                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
23148                 if (node == NULL)
23149                         goto out;
23150 -               preempt_disable();
23151 +               local_lock(radix_tree_preloads_lock);
23152                 rtp = this_cpu_ptr(&radix_tree_preloads);
23153                 if (rtp->nr < nr) {
23154                         node->private_data = rtp->nodes;
23155 @@ -406,7 +408,7 @@
23156         if (gfpflags_allow_blocking(gfp_mask))
23157                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
23158         /* Preloading doesn't help anything with this gfp mask, skip it */
23159 -       preempt_disable();
23160 +       local_lock(radix_tree_preloads_lock);
23161         return 0;
23163  EXPORT_SYMBOL(radix_tree_maybe_preload);
23164 @@ -422,7 +424,7 @@
23166         /* Preloading doesn't help anything with this gfp mask, skip it */
23167         if (!gfpflags_allow_blocking(gfp_mask)) {
23168 -               preempt_disable();
23169 +               local_lock(radix_tree_preloads_lock);
23170                 return 0;
23171         }
23173 @@ -456,6 +458,12 @@
23174         return __radix_tree_preload(gfp_mask, nr_nodes);
23177 +void radix_tree_preload_end(void)
23179 +       local_unlock(radix_tree_preloads_lock);
23181 +EXPORT_SYMBOL(radix_tree_preload_end);
23183  /*
23184   * The maximum index which can be stored in a radix tree
23185   */
23186 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/scatterlist.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/scatterlist.c
23187 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/scatterlist.c       2017-04-16 10:38:30.000000000 +0200
23188 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/scatterlist.c    2017-04-18 17:54:26.000000000 +0200
23189 @@ -620,7 +620,7 @@
23190                         flush_kernel_dcache_page(miter->page);
23192                 if (miter->__flags & SG_MITER_ATOMIC) {
23193 -                       WARN_ON_ONCE(preemptible());
23194 +                       WARN_ON_ONCE(!pagefault_disabled());
23195                         kunmap_atomic(miter->addr);
23196                 } else
23197                         kunmap(miter->page);
23198 @@ -664,7 +664,7 @@
23199         if (!sg_miter_skip(&miter, skip))
23200                 return false;
23202 -       local_irq_save(flags);
23203 +       local_irq_save_nort(flags);
23205         while (sg_miter_next(&miter) && offset < buflen) {
23206                 unsigned int len;
23207 @@ -681,7 +681,7 @@
23209         sg_miter_stop(&miter);
23211 -       local_irq_restore(flags);
23212 +       local_irq_restore_nort(flags);
23213         return offset;
23215  EXPORT_SYMBOL(sg_copy_buffer);
23216 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/smp_processor_id.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/smp_processor_id.c
23217 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/lib/smp_processor_id.c  2017-04-16 10:38:30.000000000 +0200
23218 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/lib/smp_processor_id.c       2017-04-18 17:54:26.000000000 +0200
23219 @@ -39,8 +39,9 @@
23220         if (!printk_ratelimit())
23221                 goto out_enable;
23223 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
23224 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
23225 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
23226 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
23227 +               current->comm, current->pid);
23229         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
23230         dump_stack();
23231 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/Kconfig linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/Kconfig
23232 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/Kconfig      2017-04-16 10:38:30.000000000 +0200
23233 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/Kconfig   2017-04-18 17:54:26.000000000 +0200
23234 @@ -410,7 +410,7 @@
23236  config TRANSPARENT_HUGEPAGE
23237         bool "Transparent Hugepage Support"
23238 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
23239 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
23240         select COMPACTION
23241         select RADIX_TREE_MULTIORDER
23242         help
23243 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/backing-dev.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/backing-dev.c
23244 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/backing-dev.c        2017-04-16 10:38:30.000000000 +0200
23245 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/backing-dev.c     2017-04-18 17:54:26.000000000 +0200
23246 @@ -457,9 +457,9 @@
23248         unsigned long flags;
23250 -       local_irq_save(flags);
23251 +       local_irq_save_nort(flags);
23252         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
23253 -               local_irq_restore(flags);
23254 +               local_irq_restore_nort(flags);
23255                 return;
23256         }
23258 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/compaction.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/compaction.c
23259 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/compaction.c 2017-04-16 10:38:30.000000000 +0200
23260 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/compaction.c      2017-04-18 17:54:26.000000000 +0200
23261 @@ -1593,10 +1593,12 @@
23262                                 block_start_pfn(cc->migrate_pfn, cc->order);
23264                         if (cc->last_migrated_pfn < current_block_start) {
23265 -                               cpu = get_cpu();
23266 +                               cpu = get_cpu_light();
23267 +                               local_lock_irq(swapvec_lock);
23268                                 lru_add_drain_cpu(cpu);
23269 +                               local_unlock_irq(swapvec_lock);
23270                                 drain_local_pages(zone);
23271 -                               put_cpu();
23272 +                               put_cpu_light();
23273                                 /* No more flushing until we migrate again */
23274                                 cc->last_migrated_pfn = 0;
23275                         }
23276 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/filemap.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/filemap.c
23277 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/filemap.c    2017-04-16 10:38:30.000000000 +0200
23278 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/filemap.c 2017-04-18 17:54:26.000000000 +0200
23279 @@ -159,9 +159,12 @@
23280                  * node->private_list is protected by
23281                  * mapping->tree_lock.
23282                  */
23283 -               if (!list_empty(&node->private_list))
23284 -                       list_lru_del(&workingset_shadow_nodes,
23285 +               if (!list_empty(&node->private_list)) {
23286 +                       local_lock(workingset_shadow_lock);
23287 +                       list_lru_del(&__workingset_shadow_nodes,
23288                                      &node->private_list);
23289 +                       local_unlock(workingset_shadow_lock);
23290 +               }
23291         }
23292         return 0;
23294 @@ -217,8 +220,10 @@
23295                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
23296                                 list_empty(&node->private_list)) {
23297                         node->private_data = mapping;
23298 -                       list_lru_add(&workingset_shadow_nodes,
23299 -                                       &node->private_list);
23300 +                       local_lock(workingset_shadow_lock);
23301 +                       list_lru_add(&__workingset_shadow_nodes,
23302 +                                    &node->private_list);
23303 +                       local_unlock(workingset_shadow_lock);
23304                 }
23305         }
23307 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/highmem.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/highmem.c
23308 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/highmem.c    2017-04-16 10:38:30.000000000 +0200
23309 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/highmem.c 2017-04-18 17:54:26.000000000 +0200
23310 @@ -29,10 +29,11 @@
23311  #include <linux/kgdb.h>
23312  #include <asm/tlbflush.h>
23315 +#ifndef CONFIG_PREEMPT_RT_FULL
23316  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
23317  DEFINE_PER_CPU(int, __kmap_atomic_idx);
23318  #endif
23319 +#endif
23321  /*
23322   * Virtual_count is not a pure "count".
23323 @@ -107,8 +108,9 @@
23324  unsigned long totalhigh_pages __read_mostly;
23325  EXPORT_SYMBOL(totalhigh_pages);
23328 +#ifndef CONFIG_PREEMPT_RT_FULL
23329  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
23330 +#endif
23332  unsigned int nr_free_highpages (void)
23334 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/memcontrol.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/memcontrol.c
23335 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/memcontrol.c 2017-04-16 10:38:30.000000000 +0200
23336 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/memcontrol.c      2017-04-18 17:54:26.000000000 +0200
23337 @@ -67,6 +67,7 @@
23338  #include <net/sock.h>
23339  #include <net/ip.h>
23340  #include "slab.h"
23341 +#include <linux/locallock.h>
23343  #include <asm/uaccess.h>
23345 @@ -92,6 +93,8 @@
23346  #define do_swap_account                0
23347  #endif
23349 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
23351  /* Whether legacy memory+swap accounting is active */
23352  static bool do_memsw_account(void)
23354 @@ -1692,6 +1695,7 @@
23355  #define FLUSHING_CACHED_CHARGE 0
23356  };
23357  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
23358 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
23359  static DEFINE_MUTEX(percpu_charge_mutex);
23361  /**
23362 @@ -1714,7 +1718,7 @@
23363         if (nr_pages > CHARGE_BATCH)
23364                 return ret;
23366 -       local_irq_save(flags);
23367 +       local_lock_irqsave(memcg_stock_ll, flags);
23369         stock = this_cpu_ptr(&memcg_stock);
23370         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
23371 @@ -1722,7 +1726,7 @@
23372                 ret = true;
23373         }
23375 -       local_irq_restore(flags);
23376 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23378         return ret;
23380 @@ -1749,13 +1753,13 @@
23381         struct memcg_stock_pcp *stock;
23382         unsigned long flags;
23384 -       local_irq_save(flags);
23385 +       local_lock_irqsave(memcg_stock_ll, flags);
23387         stock = this_cpu_ptr(&memcg_stock);
23388         drain_stock(stock);
23389         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
23391 -       local_irq_restore(flags);
23392 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23395  /*
23396 @@ -1767,7 +1771,7 @@
23397         struct memcg_stock_pcp *stock;
23398         unsigned long flags;
23400 -       local_irq_save(flags);
23401 +       local_lock_irqsave(memcg_stock_ll, flags);
23403         stock = this_cpu_ptr(&memcg_stock);
23404         if (stock->cached != memcg) { /* reset if necessary */
23405 @@ -1776,7 +1780,7 @@
23406         }
23407         stock->nr_pages += nr_pages;
23409 -       local_irq_restore(flags);
23410 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23413  /*
23414 @@ -1792,7 +1796,7 @@
23415                 return;
23416         /* Notify other cpus that system-wide "drain" is running */
23417         get_online_cpus();
23418 -       curcpu = get_cpu();
23419 +       curcpu = get_cpu_light();
23420         for_each_online_cpu(cpu) {
23421                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
23422                 struct mem_cgroup *memcg;
23423 @@ -1809,7 +1813,7 @@
23424                                 schedule_work_on(cpu, &stock->work);
23425                 }
23426         }
23427 -       put_cpu();
23428 +       put_cpu_light();
23429         put_online_cpus();
23430         mutex_unlock(&percpu_charge_mutex);
23432 @@ -4553,12 +4557,12 @@
23434         ret = 0;
23436 -       local_irq_disable();
23437 +       local_lock_irq(event_lock);
23438         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
23439         memcg_check_events(to, page);
23440         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
23441         memcg_check_events(from, page);
23442 -       local_irq_enable();
23443 +       local_unlock_irq(event_lock);
23444  out_unlock:
23445         unlock_page(page);
23446  out:
23447 @@ -5433,10 +5437,10 @@
23449         commit_charge(page, memcg, lrucare);
23451 -       local_irq_disable();
23452 +       local_lock_irq(event_lock);
23453         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
23454         memcg_check_events(memcg, page);
23455 -       local_irq_enable();
23456 +       local_unlock_irq(event_lock);
23458         if (do_memsw_account() && PageSwapCache(page)) {
23459                 swp_entry_t entry = { .val = page_private(page) };
23460 @@ -5492,14 +5496,14 @@
23461                 memcg_oom_recover(memcg);
23462         }
23464 -       local_irq_save(flags);
23465 +       local_lock_irqsave(event_lock, flags);
23466         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
23467         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
23468         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
23469         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
23470         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
23471         memcg_check_events(memcg, dummy_page);
23472 -       local_irq_restore(flags);
23473 +       local_unlock_irqrestore(event_lock, flags);
23475         if (!mem_cgroup_is_root(memcg))
23476                 css_put_many(&memcg->css, nr_pages);
23477 @@ -5654,10 +5658,10 @@
23479         commit_charge(newpage, memcg, false);
23481 -       local_irq_save(flags);
23482 +       local_lock_irqsave(event_lock, flags);
23483         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
23484         memcg_check_events(memcg, newpage);
23485 -       local_irq_restore(flags);
23486 +       local_unlock_irqrestore(event_lock, flags);
23489  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
23490 @@ -5837,6 +5841,7 @@
23492         struct mem_cgroup *memcg, *swap_memcg;
23493         unsigned short oldid;
23494 +       unsigned long flags;
23496         VM_BUG_ON_PAGE(PageLRU(page), page);
23497         VM_BUG_ON_PAGE(page_count(page), page);
23498 @@ -5877,12 +5882,16 @@
23499          * important here to have the interrupts disabled because it is the
23500          * only synchronisation we have for udpating the per-CPU variables.
23501          */
23502 +       local_lock_irqsave(event_lock, flags);
23503 +#ifndef CONFIG_PREEMPT_RT_BASE
23504         VM_BUG_ON(!irqs_disabled());
23505 +#endif
23506         mem_cgroup_charge_statistics(memcg, page, false, -1);
23507         memcg_check_events(memcg, page);
23509         if (!mem_cgroup_is_root(memcg))
23510                 css_put(&memcg->css);
23511 +       local_unlock_irqrestore(event_lock, flags);
23514  /*
23515 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/mmu_context.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/mmu_context.c
23516 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/mmu_context.c        2017-04-16 10:38:30.000000000 +0200
23517 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/mmu_context.c     2017-04-18 17:54:26.000000000 +0200
23518 @@ -23,6 +23,7 @@
23519         struct task_struct *tsk = current;
23521         task_lock(tsk);
23522 +       preempt_disable_rt();
23523         active_mm = tsk->active_mm;
23524         if (active_mm != mm) {
23525                 atomic_inc(&mm->mm_count);
23526 @@ -30,6 +31,7 @@
23527         }
23528         tsk->mm = mm;
23529         switch_mm(active_mm, mm, tsk);
23530 +       preempt_enable_rt();
23531         task_unlock(tsk);
23532  #ifdef finish_arch_post_lock_switch
23533         finish_arch_post_lock_switch();
23534 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/page_alloc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/page_alloc.c
23535 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/page_alloc.c 2017-04-16 10:38:30.000000000 +0200
23536 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/page_alloc.c      2017-04-18 17:54:26.000000000 +0200
23537 @@ -61,6 +61,7 @@
23538  #include <linux/page_ext.h>
23539  #include <linux/hugetlb.h>
23540  #include <linux/sched/rt.h>
23541 +#include <linux/locallock.h>
23542  #include <linux/page_owner.h>
23543  #include <linux/kthread.h>
23544  #include <linux/memcontrol.h>
23545 @@ -281,6 +282,18 @@
23546  EXPORT_SYMBOL(nr_online_nodes);
23547  #endif
23549 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
23551 +#ifdef CONFIG_PREEMPT_RT_BASE
23552 +# define cpu_lock_irqsave(cpu, flags)          \
23553 +       local_lock_irqsave_on(pa_lock, flags, cpu)
23554 +# define cpu_unlock_irqrestore(cpu, flags)     \
23555 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
23556 +#else
23557 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
23558 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
23559 +#endif
23561  int page_group_by_mobility_disabled __read_mostly;
23563  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
23564 @@ -1072,7 +1085,7 @@
23565  #endif /* CONFIG_DEBUG_VM */
23567  /*
23568 - * Frees a number of pages from the PCP lists
23569 + * Frees a number of pages which have been collected from the pcp lists.
23570   * Assumes all pages on list are in same zone, and of same order.
23571   * count is the number of pages to free.
23572   *
23573 @@ -1083,19 +1096,58 @@
23574   * pinned" detection logic.
23575   */
23576  static void free_pcppages_bulk(struct zone *zone, int count,
23577 -                                       struct per_cpu_pages *pcp)
23578 +                              struct list_head *list)
23580 -       int migratetype = 0;
23581 -       int batch_free = 0;
23582         unsigned long nr_scanned;
23583         bool isolated_pageblocks;
23584 +       unsigned long flags;
23586 +       spin_lock_irqsave(&zone->lock, flags);
23588 -       spin_lock(&zone->lock);
23589         isolated_pageblocks = has_isolate_pageblock(zone);
23590         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
23591         if (nr_scanned)
23592                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
23594 +       while (!list_empty(list)) {
23595 +               struct page *page;
23596 +               int mt; /* migratetype of the to-be-freed page */
23598 +               page = list_first_entry(list, struct page, lru);
23599 +               /* must delete as __free_one_page list manipulates */
23600 +               list_del(&page->lru);
23602 +               mt = get_pcppage_migratetype(page);
23603 +               /* MIGRATE_ISOLATE page should not go to pcplists */
23604 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
23605 +               /* Pageblock could have been isolated meanwhile */
23606 +               if (unlikely(isolated_pageblocks))
23607 +                       mt = get_pageblock_migratetype(page);
23609 +               if (bulkfree_pcp_prepare(page))
23610 +                       continue;
23612 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
23613 +               trace_mm_page_pcpu_drain(page, 0, mt);
23614 +               count--;
23615 +       }
23616 +       WARN_ON(count != 0);
23617 +       spin_unlock_irqrestore(&zone->lock, flags);
23621 + * Moves a number of pages from the PCP lists to free list which
23622 + * is freed outside of the locked region.
23623 + *
23624 + * Assumes all pages on list are in same zone, and of same order.
23625 + * count is the number of pages to free.
23626 + */
23627 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
23628 +                             struct list_head *dst)
23630 +       int migratetype = 0;
23631 +       int batch_free = 0;
23633         while (count) {
23634                 struct page *page;
23635                 struct list_head *list;
23636 @@ -1111,7 +1163,7 @@
23637                         batch_free++;
23638                         if (++migratetype == MIGRATE_PCPTYPES)
23639                                 migratetype = 0;
23640 -                       list = &pcp->lists[migratetype];
23641 +                       list = &src->lists[migratetype];
23642                 } while (list_empty(list));
23644                 /* This is the only non-empty list. Free them all. */
23645 @@ -1119,27 +1171,12 @@
23646                         batch_free = count;
23648                 do {
23649 -                       int mt; /* migratetype of the to-be-freed page */
23651                         page = list_last_entry(list, struct page, lru);
23652 -                       /* must delete as __free_one_page list manipulates */
23653                         list_del(&page->lru);
23655 -                       mt = get_pcppage_migratetype(page);
23656 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
23657 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
23658 -                       /* Pageblock could have been isolated meanwhile */
23659 -                       if (unlikely(isolated_pageblocks))
23660 -                               mt = get_pageblock_migratetype(page);
23662 -                       if (bulkfree_pcp_prepare(page))
23663 -                               continue;
23665 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
23666 -                       trace_mm_page_pcpu_drain(page, 0, mt);
23667 +                       list_add(&page->lru, dst);
23668                 } while (--count && --batch_free && !list_empty(list));
23669         }
23670 -       spin_unlock(&zone->lock);
23673  static void free_one_page(struct zone *zone,
23674 @@ -1148,7 +1185,9 @@
23675                                 int migratetype)
23677         unsigned long nr_scanned;
23678 -       spin_lock(&zone->lock);
23679 +       unsigned long flags;
23681 +       spin_lock_irqsave(&zone->lock, flags);
23682         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
23683         if (nr_scanned)
23684                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
23685 @@ -1158,7 +1197,7 @@
23686                 migratetype = get_pfnblock_migratetype(page, pfn);
23687         }
23688         __free_one_page(page, pfn, zone, order, migratetype);
23689 -       spin_unlock(&zone->lock);
23690 +       spin_unlock_irqrestore(&zone->lock, flags);
23693  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
23694 @@ -1244,10 +1283,10 @@
23695                 return;
23697         migratetype = get_pfnblock_migratetype(page, pfn);
23698 -       local_irq_save(flags);
23699 +       local_lock_irqsave(pa_lock, flags);
23700         __count_vm_events(PGFREE, 1 << order);
23701         free_one_page(page_zone(page), page, pfn, order, migratetype);
23702 -       local_irq_restore(flags);
23703 +       local_unlock_irqrestore(pa_lock, flags);
23706  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
23707 @@ -2246,16 +2285,18 @@
23708  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
23710         unsigned long flags;
23711 +       LIST_HEAD(dst);
23712         int to_drain, batch;
23714 -       local_irq_save(flags);
23715 +       local_lock_irqsave(pa_lock, flags);
23716         batch = READ_ONCE(pcp->batch);
23717         to_drain = min(pcp->count, batch);
23718         if (to_drain > 0) {
23719 -               free_pcppages_bulk(zone, to_drain, pcp);
23720 +               isolate_pcp_pages(to_drain, pcp, &dst);
23721                 pcp->count -= to_drain;
23722         }
23723 -       local_irq_restore(flags);
23724 +       local_unlock_irqrestore(pa_lock, flags);
23725 +       free_pcppages_bulk(zone, to_drain, &dst);
23727  #endif
23729 @@ -2271,16 +2312,21 @@
23730         unsigned long flags;
23731         struct per_cpu_pageset *pset;
23732         struct per_cpu_pages *pcp;
23733 +       LIST_HEAD(dst);
23734 +       int count;
23736 -       local_irq_save(flags);
23737 +       cpu_lock_irqsave(cpu, flags);
23738         pset = per_cpu_ptr(zone->pageset, cpu);
23740         pcp = &pset->pcp;
23741 -       if (pcp->count) {
23742 -               free_pcppages_bulk(zone, pcp->count, pcp);
23743 +       count = pcp->count;
23744 +       if (count) {
23745 +               isolate_pcp_pages(count, pcp, &dst);
23746                 pcp->count = 0;
23747         }
23748 -       local_irq_restore(flags);
23749 +       cpu_unlock_irqrestore(cpu, flags);
23750 +       if (count)
23751 +               free_pcppages_bulk(zone, count, &dst);
23754  /*
23755 @@ -2366,8 +2412,17 @@
23756                 else
23757                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
23758         }
23759 +#ifndef CONFIG_PREEMPT_RT_BASE
23760         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
23761                                                                 zone, 1);
23762 +#else
23763 +       for_each_cpu(cpu, &cpus_with_pcps) {
23764 +               if (zone)
23765 +                       drain_pages_zone(cpu, zone);
23766 +               else
23767 +                       drain_pages(cpu);
23768 +       }
23769 +#endif
23772  #ifdef CONFIG_HIBERNATION
23773 @@ -2427,7 +2482,7 @@
23775         migratetype = get_pfnblock_migratetype(page, pfn);
23776         set_pcppage_migratetype(page, migratetype);
23777 -       local_irq_save(flags);
23778 +       local_lock_irqsave(pa_lock, flags);
23779         __count_vm_event(PGFREE);
23781         /*
23782 @@ -2453,12 +2508,17 @@
23783         pcp->count++;
23784         if (pcp->count >= pcp->high) {
23785                 unsigned long batch = READ_ONCE(pcp->batch);
23786 -               free_pcppages_bulk(zone, batch, pcp);
23787 +               LIST_HEAD(dst);
23789 +               isolate_pcp_pages(batch, pcp, &dst);
23790                 pcp->count -= batch;
23791 +               local_unlock_irqrestore(pa_lock, flags);
23792 +               free_pcppages_bulk(zone, batch, &dst);
23793 +               return;
23794         }
23796  out:
23797 -       local_irq_restore(flags);
23798 +       local_unlock_irqrestore(pa_lock, flags);
23801  /*
23802 @@ -2600,7 +2660,7 @@
23803                 struct per_cpu_pages *pcp;
23804                 struct list_head *list;
23806 -               local_irq_save(flags);
23807 +               local_lock_irqsave(pa_lock, flags);
23808                 do {
23809                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
23810                         list = &pcp->lists[migratetype];
23811 @@ -2627,7 +2687,7 @@
23812                  * allocate greater than order-1 page units with __GFP_NOFAIL.
23813                  */
23814                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
23815 -               spin_lock_irqsave(&zone->lock, flags);
23816 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
23818                 do {
23819                         page = NULL;
23820 @@ -2639,22 +2699,24 @@
23821                         if (!page)
23822                                 page = __rmqueue(zone, order, migratetype);
23823                 } while (page && check_new_pages(page, order));
23824 -               spin_unlock(&zone->lock);
23825 -               if (!page)
23826 +               if (!page) {
23827 +                       spin_unlock(&zone->lock);
23828                         goto failed;
23829 +               }
23830                 __mod_zone_freepage_state(zone, -(1 << order),
23831                                           get_pcppage_migratetype(page));
23832 +               spin_unlock(&zone->lock);
23833         }
23835         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
23836         zone_statistics(preferred_zone, zone, gfp_flags);
23837 -       local_irq_restore(flags);
23838 +       local_unlock_irqrestore(pa_lock, flags);
23840         VM_BUG_ON_PAGE(bad_range(zone, page), page);
23841         return page;
23843  failed:
23844 -       local_irq_restore(flags);
23845 +       local_unlock_irqrestore(pa_lock, flags);
23846         return NULL;
23849 @@ -6531,7 +6593,9 @@
23850         int cpu = (unsigned long)hcpu;
23852         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
23853 +               local_lock_irq_on(swapvec_lock, cpu);
23854                 lru_add_drain_cpu(cpu);
23855 +               local_unlock_irq_on(swapvec_lock, cpu);
23856                 drain_pages(cpu);
23858                 /*
23859 @@ -6557,6 +6621,7 @@
23860  void __init page_alloc_init(void)
23862         hotcpu_notifier(page_alloc_cpu_notify, 0);
23863 +       local_irq_lock_init(pa_lock);
23866  /*
23867 @@ -7383,7 +7448,7 @@
23868         struct per_cpu_pageset *pset;
23870         /* avoid races with drain_pages()  */
23871 -       local_irq_save(flags);
23872 +       local_lock_irqsave(pa_lock, flags);
23873         if (zone->pageset != &boot_pageset) {
23874                 for_each_online_cpu(cpu) {
23875                         pset = per_cpu_ptr(zone->pageset, cpu);
23876 @@ -7392,7 +7457,7 @@
23877                 free_percpu(zone->pageset);
23878                 zone->pageset = &boot_pageset;
23879         }
23880 -       local_irq_restore(flags);
23881 +       local_unlock_irqrestore(pa_lock, flags);
23884  #ifdef CONFIG_MEMORY_HOTREMOVE
23885 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/percpu.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/percpu.c
23886 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/percpu.c     2017-04-16 10:38:30.000000000 +0200
23887 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/percpu.c  2017-04-18 17:54:26.000000000 +0200
23888 @@ -1283,18 +1283,7 @@
23890  EXPORT_SYMBOL_GPL(free_percpu);
23892 -/**
23893 - * is_kernel_percpu_address - test whether address is from static percpu area
23894 - * @addr: address to test
23895 - *
23896 - * Test whether @addr belongs to in-kernel static percpu area.  Module
23897 - * static percpu areas are not considered.  For those, use
23898 - * is_module_percpu_address().
23899 - *
23900 - * RETURNS:
23901 - * %true if @addr is from in-kernel static percpu area, %false otherwise.
23902 - */
23903 -bool is_kernel_percpu_address(unsigned long addr)
23904 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
23906  #ifdef CONFIG_SMP
23907         const size_t static_size = __per_cpu_end - __per_cpu_start;
23908 @@ -1303,16 +1292,39 @@
23910         for_each_possible_cpu(cpu) {
23911                 void *start = per_cpu_ptr(base, cpu);
23912 +               void *va = (void *)addr;
23914 -               if ((void *)addr >= start && (void *)addr < start + static_size)
23915 +               if (va >= start && va < start + static_size) {
23916 +                       if (can_addr) {
23917 +                               *can_addr = (unsigned long) (va - start);
23918 +                               *can_addr += (unsigned long)
23919 +                                       per_cpu_ptr(base, get_boot_cpu_id());
23920 +                       }
23921                         return true;
23922 -        }
23923 +               }
23924 +       }
23925  #endif
23926         /* on UP, can't distinguish from other static vars, always false */
23927         return false;
23930  /**
23931 + * is_kernel_percpu_address - test whether address is from static percpu area
23932 + * @addr: address to test
23933 + *
23934 + * Test whether @addr belongs to in-kernel static percpu area.  Module
23935 + * static percpu areas are not considered.  For those, use
23936 + * is_module_percpu_address().
23937 + *
23938 + * RETURNS:
23939 + * %true if @addr is from in-kernel static percpu area, %false otherwise.
23940 + */
23941 +bool is_kernel_percpu_address(unsigned long addr)
23943 +       return __is_kernel_percpu_address(addr, NULL);
23946 +/**
23947   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
23948   * @addr: the address to be converted to physical address
23949   *
23950 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/slab.h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/slab.h
23951 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/slab.h       2017-04-16 10:38:30.000000000 +0200
23952 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/slab.h    2017-04-18 17:54:26.000000000 +0200
23953 @@ -426,7 +426,11 @@
23954   * The slab lists for all objects.
23955   */
23956  struct kmem_cache_node {
23957 +#ifdef CONFIG_SLUB
23958 +       raw_spinlock_t list_lock;
23959 +#else
23960         spinlock_t list_lock;
23961 +#endif
23963  #ifdef CONFIG_SLAB
23964         struct list_head slabs_partial; /* partial list first, better asm code */
23965 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/slub.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/slub.c
23966 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/slub.c       2017-04-16 10:38:30.000000000 +0200
23967 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/slub.c    2017-04-18 17:54:26.000000000 +0200
23968 @@ -1141,7 +1141,7 @@
23969         unsigned long uninitialized_var(flags);
23970         int ret = 0;
23972 -       spin_lock_irqsave(&n->list_lock, flags);
23973 +       raw_spin_lock_irqsave(&n->list_lock, flags);
23974         slab_lock(page);
23976         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
23977 @@ -1176,7 +1176,7 @@
23978                          bulk_cnt, cnt);
23980         slab_unlock(page);
23981 -       spin_unlock_irqrestore(&n->list_lock, flags);
23982 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
23983         if (!ret)
23984                 slab_fix(s, "Object at 0x%p not freed", object);
23985         return ret;
23986 @@ -1304,6 +1304,12 @@
23988  #endif /* CONFIG_SLUB_DEBUG */
23990 +struct slub_free_list {
23991 +       raw_spinlock_t          lock;
23992 +       struct list_head        list;
23994 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
23996  /*
23997   * Hooks for other subsystems that check memory allocations. In a typical
23998   * production configuration these hooks all should produce no code at all.
23999 @@ -1527,10 +1533,17 @@
24000         void *start, *p;
24001         int idx, order;
24002         bool shuffle;
24003 +       bool enableirqs = false;
24005         flags &= gfp_allowed_mask;
24007         if (gfpflags_allow_blocking(flags))
24008 +               enableirqs = true;
24009 +#ifdef CONFIG_PREEMPT_RT_FULL
24010 +       if (system_state == SYSTEM_RUNNING)
24011 +               enableirqs = true;
24012 +#endif
24013 +       if (enableirqs)
24014                 local_irq_enable();
24016         flags |= s->allocflags;
24017 @@ -1605,7 +1618,7 @@
24018         page->frozen = 1;
24020  out:
24021 -       if (gfpflags_allow_blocking(flags))
24022 +       if (enableirqs)
24023                 local_irq_disable();
24024         if (!page)
24025                 return NULL;
24026 @@ -1664,6 +1677,16 @@
24027         __free_pages(page, order);
24030 +static void free_delayed(struct list_head *h)
24032 +       while(!list_empty(h)) {
24033 +               struct page *page = list_first_entry(h, struct page, lru);
24035 +               list_del(&page->lru);
24036 +               __free_slab(page->slab_cache, page);
24037 +       }
24040  #define need_reserve_slab_rcu                                          \
24041         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
24043 @@ -1695,6 +1718,12 @@
24044                 }
24046                 call_rcu(head, rcu_free_slab);
24047 +       } else if (irqs_disabled()) {
24048 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
24050 +               raw_spin_lock(&f->lock);
24051 +               list_add(&page->lru, &f->list);
24052 +               raw_spin_unlock(&f->lock);
24053         } else
24054                 __free_slab(s, page);
24056 @@ -1802,7 +1831,7 @@
24057         if (!n || !n->nr_partial)
24058                 return NULL;
24060 -       spin_lock(&n->list_lock);
24061 +       raw_spin_lock(&n->list_lock);
24062         list_for_each_entry_safe(page, page2, &n->partial, lru) {
24063                 void *t;
24065 @@ -1827,7 +1856,7 @@
24066                         break;
24068         }
24069 -       spin_unlock(&n->list_lock);
24070 +       raw_spin_unlock(&n->list_lock);
24071         return object;
24074 @@ -2073,7 +2102,7 @@
24075                          * that acquire_slab() will see a slab page that
24076                          * is frozen
24077                          */
24078 -                       spin_lock(&n->list_lock);
24079 +                       raw_spin_lock(&n->list_lock);
24080                 }
24081         } else {
24082                 m = M_FULL;
24083 @@ -2084,7 +2113,7 @@
24084                          * slabs from diagnostic functions will not see
24085                          * any frozen slabs.
24086                          */
24087 -                       spin_lock(&n->list_lock);
24088 +                       raw_spin_lock(&n->list_lock);
24089                 }
24090         }
24092 @@ -2119,7 +2148,7 @@
24093                 goto redo;
24095         if (lock)
24096 -               spin_unlock(&n->list_lock);
24097 +               raw_spin_unlock(&n->list_lock);
24099         if (m == M_FREE) {
24100                 stat(s, DEACTIVATE_EMPTY);
24101 @@ -2151,10 +2180,10 @@
24102                 n2 = get_node(s, page_to_nid(page));
24103                 if (n != n2) {
24104                         if (n)
24105 -                               spin_unlock(&n->list_lock);
24106 +                               raw_spin_unlock(&n->list_lock);
24108                         n = n2;
24109 -                       spin_lock(&n->list_lock);
24110 +                       raw_spin_lock(&n->list_lock);
24111                 }
24113                 do {
24114 @@ -2183,7 +2212,7 @@
24115         }
24117         if (n)
24118 -               spin_unlock(&n->list_lock);
24119 +               raw_spin_unlock(&n->list_lock);
24121         while (discard_page) {
24122                 page = discard_page;
24123 @@ -2222,14 +2251,21 @@
24124                         pobjects = oldpage->pobjects;
24125                         pages = oldpage->pages;
24126                         if (drain && pobjects > s->cpu_partial) {
24127 +                               struct slub_free_list *f;
24128                                 unsigned long flags;
24129 +                               LIST_HEAD(tofree);
24130                                 /*
24131                                  * partial array is full. Move the existing
24132                                  * set to the per node partial list.
24133                                  */
24134                                 local_irq_save(flags);
24135                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
24136 +                               f = this_cpu_ptr(&slub_free_list);
24137 +                               raw_spin_lock(&f->lock);
24138 +                               list_splice_init(&f->list, &tofree);
24139 +                               raw_spin_unlock(&f->lock);
24140                                 local_irq_restore(flags);
24141 +                               free_delayed(&tofree);
24142                                 oldpage = NULL;
24143                                 pobjects = 0;
24144                                 pages = 0;
24145 @@ -2301,7 +2337,22 @@
24147  static void flush_all(struct kmem_cache *s)
24149 +       LIST_HEAD(tofree);
24150 +       int cpu;
24152         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
24153 +       for_each_online_cpu(cpu) {
24154 +               struct slub_free_list *f;
24156 +               if (!has_cpu_slab(cpu, s))
24157 +                       continue;
24159 +               f = &per_cpu(slub_free_list, cpu);
24160 +               raw_spin_lock_irq(&f->lock);
24161 +               list_splice_init(&f->list, &tofree);
24162 +               raw_spin_unlock_irq(&f->lock);
24163 +               free_delayed(&tofree);
24164 +       }
24167  /*
24168 @@ -2356,10 +2407,10 @@
24169         unsigned long x = 0;
24170         struct page *page;
24172 -       spin_lock_irqsave(&n->list_lock, flags);
24173 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24174         list_for_each_entry(page, &n->partial, lru)
24175                 x += get_count(page);
24176 -       spin_unlock_irqrestore(&n->list_lock, flags);
24177 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24178         return x;
24180  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
24181 @@ -2497,8 +2548,10 @@
24182   * already disabled (which is the case for bulk allocation).
24183   */
24184  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24185 -                         unsigned long addr, struct kmem_cache_cpu *c)
24186 +                         unsigned long addr, struct kmem_cache_cpu *c,
24187 +                         struct list_head *to_free)
24189 +       struct slub_free_list *f;
24190         void *freelist;
24191         struct page *page;
24193 @@ -2558,6 +2611,13 @@
24194         VM_BUG_ON(!c->page->frozen);
24195         c->freelist = get_freepointer(s, freelist);
24196         c->tid = next_tid(c->tid);
24198 +out:
24199 +       f = this_cpu_ptr(&slub_free_list);
24200 +       raw_spin_lock(&f->lock);
24201 +       list_splice_init(&f->list, to_free);
24202 +       raw_spin_unlock(&f->lock);
24204         return freelist;
24206  new_slab:
24207 @@ -2589,7 +2649,7 @@
24208         deactivate_slab(s, page, get_freepointer(s, freelist));
24209         c->page = NULL;
24210         c->freelist = NULL;
24211 -       return freelist;
24212 +       goto out;
24215  /*
24216 @@ -2601,6 +2661,7 @@
24218         void *p;
24219         unsigned long flags;
24220 +       LIST_HEAD(tofree);
24222         local_irq_save(flags);
24223  #ifdef CONFIG_PREEMPT
24224 @@ -2612,8 +2673,9 @@
24225         c = this_cpu_ptr(s->cpu_slab);
24226  #endif
24228 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
24229 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
24230         local_irq_restore(flags);
24231 +       free_delayed(&tofree);
24232         return p;
24235 @@ -2799,7 +2861,7 @@
24237         do {
24238                 if (unlikely(n)) {
24239 -                       spin_unlock_irqrestore(&n->list_lock, flags);
24240 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24241                         n = NULL;
24242                 }
24243                 prior = page->freelist;
24244 @@ -2831,7 +2893,7 @@
24245                                  * Otherwise the list_lock will synchronize with
24246                                  * other processors updating the list of slabs.
24247                                  */
24248 -                               spin_lock_irqsave(&n->list_lock, flags);
24249 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
24251                         }
24252                 }
24253 @@ -2873,7 +2935,7 @@
24254                 add_partial(n, page, DEACTIVATE_TO_TAIL);
24255                 stat(s, FREE_ADD_PARTIAL);
24256         }
24257 -       spin_unlock_irqrestore(&n->list_lock, flags);
24258 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24259         return;
24261  slab_empty:
24262 @@ -2888,7 +2950,7 @@
24263                 remove_full(s, n, page);
24264         }
24266 -       spin_unlock_irqrestore(&n->list_lock, flags);
24267 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24268         stat(s, FREE_SLAB);
24269         discard_slab(s, page);
24271 @@ -3093,6 +3155,7 @@
24272                           void **p)
24274         struct kmem_cache_cpu *c;
24275 +       LIST_HEAD(to_free);
24276         int i;
24278         /* memcg and kmem_cache debug support */
24279 @@ -3116,7 +3179,7 @@
24280                          * of re-populating per CPU c->freelist
24281                          */
24282                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
24283 -                                           _RET_IP_, c);
24284 +                                           _RET_IP_, c, &to_free);
24285                         if (unlikely(!p[i]))
24286                                 goto error;
24288 @@ -3128,6 +3191,7 @@
24289         }
24290         c->tid = next_tid(c->tid);
24291         local_irq_enable();
24292 +       free_delayed(&to_free);
24294         /* Clear memory outside IRQ disabled fastpath loop */
24295         if (unlikely(flags & __GFP_ZERO)) {
24296 @@ -3275,7 +3339,7 @@
24297  init_kmem_cache_node(struct kmem_cache_node *n)
24299         n->nr_partial = 0;
24300 -       spin_lock_init(&n->list_lock);
24301 +       raw_spin_lock_init(&n->list_lock);
24302         INIT_LIST_HEAD(&n->partial);
24303  #ifdef CONFIG_SLUB_DEBUG
24304         atomic_long_set(&n->nr_slabs, 0);
24305 @@ -3619,6 +3683,10 @@
24306                                                         const char *text)
24308  #ifdef CONFIG_SLUB_DEBUG
24309 +#ifdef CONFIG_PREEMPT_RT_BASE
24310 +       /* XXX move out of irq-off section */
24311 +       slab_err(s, page, text, s->name);
24312 +#else
24313         void *addr = page_address(page);
24314         void *p;
24315         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
24316 @@ -3639,6 +3707,7 @@
24317         slab_unlock(page);
24318         kfree(map);
24319  #endif
24320 +#endif
24323  /*
24324 @@ -3652,7 +3721,7 @@
24325         struct page *page, *h;
24327         BUG_ON(irqs_disabled());
24328 -       spin_lock_irq(&n->list_lock);
24329 +       raw_spin_lock_irq(&n->list_lock);
24330         list_for_each_entry_safe(page, h, &n->partial, lru) {
24331                 if (!page->inuse) {
24332                         remove_partial(n, page);
24333 @@ -3662,7 +3731,7 @@
24334                         "Objects remaining in %s on __kmem_cache_shutdown()");
24335                 }
24336         }
24337 -       spin_unlock_irq(&n->list_lock);
24338 +       raw_spin_unlock_irq(&n->list_lock);
24340         list_for_each_entry_safe(page, h, &discard, lru)
24341                 discard_slab(s, page);
24342 @@ -3905,7 +3974,7 @@
24343                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
24344                         INIT_LIST_HEAD(promote + i);
24346 -               spin_lock_irqsave(&n->list_lock, flags);
24347 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24349                 /*
24350                  * Build lists of slabs to discard or promote.
24351 @@ -3936,7 +4005,7 @@
24352                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
24353                         list_splice(promote + i, &n->partial);
24355 -               spin_unlock_irqrestore(&n->list_lock, flags);
24356 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24358                 /* Release empty slabs */
24359                 list_for_each_entry_safe(page, t, &discard, lru)
24360 @@ -4112,6 +4181,12 @@
24362         static __initdata struct kmem_cache boot_kmem_cache,
24363                 boot_kmem_cache_node;
24364 +       int cpu;
24366 +       for_each_possible_cpu(cpu) {
24367 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
24368 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
24369 +       }
24371         if (debug_guardpage_minorder())
24372                 slub_max_order = 0;
24373 @@ -4320,7 +4395,7 @@
24374         struct page *page;
24375         unsigned long flags;
24377 -       spin_lock_irqsave(&n->list_lock, flags);
24378 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24380         list_for_each_entry(page, &n->partial, lru) {
24381                 validate_slab_slab(s, page, map);
24382 @@ -4342,7 +4417,7 @@
24383                        s->name, count, atomic_long_read(&n->nr_slabs));
24385  out:
24386 -       spin_unlock_irqrestore(&n->list_lock, flags);
24387 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24388         return count;
24391 @@ -4530,12 +4605,12 @@
24392                 if (!atomic_long_read(&n->nr_slabs))
24393                         continue;
24395 -               spin_lock_irqsave(&n->list_lock, flags);
24396 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24397                 list_for_each_entry(page, &n->partial, lru)
24398                         process_slab(&t, s, page, alloc, map);
24399                 list_for_each_entry(page, &n->full, lru)
24400                         process_slab(&t, s, page, alloc, map);
24401 -               spin_unlock_irqrestore(&n->list_lock, flags);
24402 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24403         }
24405         for (i = 0; i < t.count; i++) {
24406 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/swap.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/swap.c
24407 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/swap.c       2017-04-16 10:38:30.000000000 +0200
24408 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/swap.c    2017-04-18 17:54:26.000000000 +0200
24409 @@ -32,6 +32,7 @@
24410  #include <linux/memcontrol.h>
24411  #include <linux/gfp.h>
24412  #include <linux/uio.h>
24413 +#include <linux/locallock.h>
24414  #include <linux/hugetlb.h>
24415  #include <linux/page_idle.h>
24417 @@ -50,6 +51,8 @@
24418  #ifdef CONFIG_SMP
24419  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
24420  #endif
24421 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
24422 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
24424  /*
24425   * This path almost never happens for VM activity - pages are normally
24426 @@ -240,11 +243,11 @@
24427                 unsigned long flags;
24429                 get_page(page);
24430 -               local_irq_save(flags);
24431 +               local_lock_irqsave(rotate_lock, flags);
24432                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
24433                 if (!pagevec_add(pvec, page) || PageCompound(page))
24434                         pagevec_move_tail(pvec);
24435 -               local_irq_restore(flags);
24436 +               local_unlock_irqrestore(rotate_lock, flags);
24437         }
24440 @@ -294,12 +297,13 @@
24442         page = compound_head(page);
24443         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
24444 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
24445 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24446 +                                                      activate_page_pvecs);
24448                 get_page(page);
24449                 if (!pagevec_add(pvec, page) || PageCompound(page))
24450                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
24451 -               put_cpu_var(activate_page_pvecs);
24452 +               put_locked_var(swapvec_lock, activate_page_pvecs);
24453         }
24456 @@ -326,7 +330,7 @@
24458  static void __lru_cache_activate_page(struct page *page)
24460 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24461 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24462         int i;
24464         /*
24465 @@ -348,7 +352,7 @@
24466                 }
24467         }
24469 -       put_cpu_var(lru_add_pvec);
24470 +       put_locked_var(swapvec_lock, lru_add_pvec);
24473  /*
24474 @@ -390,12 +394,12 @@
24476  static void __lru_cache_add(struct page *page)
24478 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24479 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24481         get_page(page);
24482         if (!pagevec_add(pvec, page) || PageCompound(page))
24483                 __pagevec_lru_add(pvec);
24484 -       put_cpu_var(lru_add_pvec);
24485 +       put_locked_var(swapvec_lock, lru_add_pvec);
24488  /**
24489 @@ -593,9 +597,15 @@
24490                 unsigned long flags;
24492                 /* No harm done if a racing interrupt already did this */
24493 -               local_irq_save(flags);
24494 +#ifdef CONFIG_PREEMPT_RT_BASE
24495 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
24496                 pagevec_move_tail(pvec);
24497 -               local_irq_restore(flags);
24498 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
24499 +#else
24500 +               local_lock_irqsave(rotate_lock, flags);
24501 +               pagevec_move_tail(pvec);
24502 +               local_unlock_irqrestore(rotate_lock, flags);
24503 +#endif
24504         }
24506         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
24507 @@ -627,11 +637,12 @@
24508                 return;
24510         if (likely(get_page_unless_zero(page))) {
24511 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
24512 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24513 +                                                      lru_deactivate_file_pvecs);
24515                 if (!pagevec_add(pvec, page) || PageCompound(page))
24516                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
24517 -               put_cpu_var(lru_deactivate_file_pvecs);
24518 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
24519         }
24522 @@ -646,27 +657,31 @@
24523  void deactivate_page(struct page *page)
24525         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
24526 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
24527 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24528 +                                                      lru_deactivate_pvecs);
24530                 get_page(page);
24531                 if (!pagevec_add(pvec, page) || PageCompound(page))
24532                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
24533 -               put_cpu_var(lru_deactivate_pvecs);
24534 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
24535         }
24538  void lru_add_drain(void)
24540 -       lru_add_drain_cpu(get_cpu());
24541 -       put_cpu();
24542 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
24543 +       local_unlock_cpu(swapvec_lock);
24546 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
24547 +#ifdef CONFIG_PREEMPT_RT_BASE
24548 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24550 -       lru_add_drain();
24551 +       local_lock_on(swapvec_lock, cpu);
24552 +       lru_add_drain_cpu(cpu);
24553 +       local_unlock_on(swapvec_lock, cpu);
24556 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24557 +#else
24559  /*
24560   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
24561 @@ -686,6 +701,22 @@
24563  early_initcall(lru_init);
24565 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
24567 +       lru_add_drain();
24570 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24571 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24573 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24575 +       INIT_WORK(work, lru_add_drain_per_cpu);
24576 +       queue_work_on(cpu, lru_add_drain_wq, work);
24577 +       cpumask_set_cpu(cpu, has_work);
24579 +#endif
24581  void lru_add_drain_all(void)
24583         static DEFINE_MUTEX(lock);
24584 @@ -697,21 +728,18 @@
24585         cpumask_clear(&has_work);
24587         for_each_online_cpu(cpu) {
24588 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24590                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
24591                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
24592                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
24593                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
24594 -                   need_activate_page_drain(cpu)) {
24595 -                       INIT_WORK(work, lru_add_drain_per_cpu);
24596 -                       queue_work_on(cpu, lru_add_drain_wq, work);
24597 -                       cpumask_set_cpu(cpu, &has_work);
24598 -               }
24599 +                   need_activate_page_drain(cpu))
24600 +                       remote_lru_add_drain(cpu, &has_work);
24601         }
24603 +#ifndef CONFIG_PREEMPT_RT_BASE
24604         for_each_cpu(cpu, &has_work)
24605                 flush_work(&per_cpu(lru_add_drain_work, cpu));
24606 +#endif
24608         put_online_cpus();
24609         mutex_unlock(&lock);
24610 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/truncate.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/truncate.c
24611 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/truncate.c   2017-04-16 10:38:30.000000000 +0200
24612 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/truncate.c        2017-04-18 17:54:26.000000000 +0200
24613 @@ -62,9 +62,12 @@
24614          * protected by mapping->tree_lock.
24615          */
24616         if (!workingset_node_shadows(node) &&
24617 -           !list_empty(&node->private_list))
24618 -               list_lru_del(&workingset_shadow_nodes,
24619 +           !list_empty(&node->private_list)) {
24620 +               local_lock(workingset_shadow_lock);
24621 +               list_lru_del(&__workingset_shadow_nodes,
24622                                 &node->private_list);
24623 +               local_unlock(workingset_shadow_lock);
24624 +       }
24625         __radix_tree_delete_node(&mapping->page_tree, node);
24626  unlock:
24627         spin_unlock_irq(&mapping->tree_lock);
24628 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/vmalloc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/vmalloc.c
24629 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/vmalloc.c    2017-04-16 10:38:30.000000000 +0200
24630 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/vmalloc.c 2017-04-18 17:54:26.000000000 +0200
24631 @@ -845,7 +845,7 @@
24632         struct vmap_block *vb;
24633         struct vmap_area *va;
24634         unsigned long vb_idx;
24635 -       int node, err;
24636 +       int node, err, cpu;
24637         void *vaddr;
24639         node = numa_node_id();
24640 @@ -888,11 +888,12 @@
24641         BUG_ON(err);
24642         radix_tree_preload_end();
24644 -       vbq = &get_cpu_var(vmap_block_queue);
24645 +       cpu = get_cpu_light();
24646 +       vbq = this_cpu_ptr(&vmap_block_queue);
24647         spin_lock(&vbq->lock);
24648         list_add_tail_rcu(&vb->free_list, &vbq->free);
24649         spin_unlock(&vbq->lock);
24650 -       put_cpu_var(vmap_block_queue);
24651 +       put_cpu_light();
24653         return vaddr;
24655 @@ -961,6 +962,7 @@
24656         struct vmap_block *vb;
24657         void *vaddr = NULL;
24658         unsigned int order;
24659 +       int cpu;
24661         BUG_ON(offset_in_page(size));
24662         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
24663 @@ -975,7 +977,8 @@
24664         order = get_order(size);
24666         rcu_read_lock();
24667 -       vbq = &get_cpu_var(vmap_block_queue);
24668 +       cpu = get_cpu_light();
24669 +       vbq = this_cpu_ptr(&vmap_block_queue);
24670         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
24671                 unsigned long pages_off;
24673 @@ -998,7 +1001,7 @@
24674                 break;
24675         }
24677 -       put_cpu_var(vmap_block_queue);
24678 +       put_cpu_light();
24679         rcu_read_unlock();
24681         /* Allocate new block if nothing was found */
24682 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/vmstat.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/vmstat.c
24683 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/vmstat.c     2017-04-16 10:38:30.000000000 +0200
24684 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/vmstat.c  2017-04-18 17:54:26.000000000 +0200
24685 @@ -245,6 +245,7 @@
24686         long x;
24687         long t;
24689 +       preempt_disable_rt();
24690         x = delta + __this_cpu_read(*p);
24692         t = __this_cpu_read(pcp->stat_threshold);
24693 @@ -254,6 +255,7 @@
24694                 x = 0;
24695         }
24696         __this_cpu_write(*p, x);
24697 +       preempt_enable_rt();
24699  EXPORT_SYMBOL(__mod_zone_page_state);
24701 @@ -265,6 +267,7 @@
24702         long x;
24703         long t;
24705 +       preempt_disable_rt();
24706         x = delta + __this_cpu_read(*p);
24708         t = __this_cpu_read(pcp->stat_threshold);
24709 @@ -274,6 +277,7 @@
24710                 x = 0;
24711         }
24712         __this_cpu_write(*p, x);
24713 +       preempt_enable_rt();
24715  EXPORT_SYMBOL(__mod_node_page_state);
24717 @@ -306,6 +310,7 @@
24718         s8 __percpu *p = pcp->vm_stat_diff + item;
24719         s8 v, t;
24721 +       preempt_disable_rt();
24722         v = __this_cpu_inc_return(*p);
24723         t = __this_cpu_read(pcp->stat_threshold);
24724         if (unlikely(v > t)) {
24725 @@ -314,6 +319,7 @@
24726                 zone_page_state_add(v + overstep, zone, item);
24727                 __this_cpu_write(*p, -overstep);
24728         }
24729 +       preempt_enable_rt();
24732  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24733 @@ -322,6 +328,7 @@
24734         s8 __percpu *p = pcp->vm_node_stat_diff + item;
24735         s8 v, t;
24737 +       preempt_disable_rt();
24738         v = __this_cpu_inc_return(*p);
24739         t = __this_cpu_read(pcp->stat_threshold);
24740         if (unlikely(v > t)) {
24741 @@ -330,6 +337,7 @@
24742                 node_page_state_add(v + overstep, pgdat, item);
24743                 __this_cpu_write(*p, -overstep);
24744         }
24745 +       preempt_enable_rt();
24748  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
24749 @@ -350,6 +358,7 @@
24750         s8 __percpu *p = pcp->vm_stat_diff + item;
24751         s8 v, t;
24753 +       preempt_disable_rt();
24754         v = __this_cpu_dec_return(*p);
24755         t = __this_cpu_read(pcp->stat_threshold);
24756         if (unlikely(v < - t)) {
24757 @@ -358,6 +367,7 @@
24758                 zone_page_state_add(v - overstep, zone, item);
24759                 __this_cpu_write(*p, overstep);
24760         }
24761 +       preempt_enable_rt();
24764  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24765 @@ -366,6 +376,7 @@
24766         s8 __percpu *p = pcp->vm_node_stat_diff + item;
24767         s8 v, t;
24769 +       preempt_disable_rt();
24770         v = __this_cpu_dec_return(*p);
24771         t = __this_cpu_read(pcp->stat_threshold);
24772         if (unlikely(v < - t)) {
24773 @@ -374,6 +385,7 @@
24774                 node_page_state_add(v - overstep, pgdat, item);
24775                 __this_cpu_write(*p, overstep);
24776         }
24777 +       preempt_enable_rt();
24780  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
24781 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/workingset.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/workingset.c
24782 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/workingset.c 2017-04-16 10:38:30.000000000 +0200
24783 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/workingset.c      2017-04-18 17:54:26.000000000 +0200
24784 @@ -334,7 +334,8 @@
24785   * point where they would still be useful.
24786   */
24788 -struct list_lru workingset_shadow_nodes;
24789 +struct list_lru __workingset_shadow_nodes;
24790 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
24792  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
24793                                         struct shrink_control *sc)
24794 @@ -344,9 +345,9 @@
24795         unsigned long pages;
24797         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
24798 -       local_irq_disable();
24799 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
24800 -       local_irq_enable();
24801 +       local_lock_irq(workingset_shadow_lock);
24802 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
24803 +       local_unlock_irq(workingset_shadow_lock);
24805         if (sc->memcg) {
24806                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
24807 @@ -438,9 +439,9 @@
24808         spin_unlock(&mapping->tree_lock);
24809         ret = LRU_REMOVED_RETRY;
24810  out:
24811 -       local_irq_enable();
24812 +       local_unlock_irq(workingset_shadow_lock);
24813         cond_resched();
24814 -       local_irq_disable();
24815 +       local_lock_irq(workingset_shadow_lock);
24816         spin_lock(lru_lock);
24817         return ret;
24819 @@ -451,10 +452,10 @@
24820         unsigned long ret;
24822         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
24823 -       local_irq_disable();
24824 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
24825 +       local_lock_irq(workingset_shadow_lock);
24826 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
24827                                     shadow_lru_isolate, NULL);
24828 -       local_irq_enable();
24829 +       local_unlock_irq(workingset_shadow_lock);
24830         return ret;
24833 @@ -492,7 +493,7 @@
24834         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
24835                timestamp_bits, max_order, bucket_order);
24837 -       ret = __list_lru_init(&workingset_shadow_nodes, true, &shadow_nodes_key);
24838 +       ret = __list_lru_init(&__workingset_shadow_nodes, true, &shadow_nodes_key);
24839         if (ret)
24840                 goto err;
24841         ret = register_shrinker(&workingset_shadow_shrinker);
24842 @@ -500,7 +501,7 @@
24843                 goto err_list_lru;
24844         return 0;
24845  err_list_lru:
24846 -       list_lru_destroy(&workingset_shadow_nodes);
24847 +       list_lru_destroy(&__workingset_shadow_nodes);
24848  err:
24849         return ret;
24851 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/zsmalloc.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/zsmalloc.c
24852 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/mm/zsmalloc.c   2017-04-16 10:38:30.000000000 +0200
24853 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/mm/zsmalloc.c        2017-04-18 17:54:26.000000000 +0200
24854 @@ -53,6 +53,7 @@
24855  #include <linux/mount.h>
24856  #include <linux/migrate.h>
24857  #include <linux/pagemap.h>
24858 +#include <linux/locallock.h>
24860  #define ZSPAGE_MAGIC   0x58
24862 @@ -70,9 +71,22 @@
24863   */
24864  #define ZS_MAX_ZSPAGE_ORDER 2
24865  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
24867  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
24869 +#ifdef CONFIG_PREEMPT_RT_FULL
24871 +struct zsmalloc_handle {
24872 +       unsigned long addr;
24873 +       struct mutex lock;
24876 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
24878 +#else
24880 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
24881 +#endif
24883  /*
24884   * Object location (<PFN>, <obj_idx>) is encoded as
24885   * as single (unsigned long) handle value.
24886 @@ -327,7 +341,7 @@
24888  static int create_cache(struct zs_pool *pool)
24890 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
24891 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
24892                                         0, 0, NULL);
24893         if (!pool->handle_cachep)
24894                 return 1;
24895 @@ -351,10 +365,27 @@
24897  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
24899 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
24900 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
24901 +       void *p;
24903 +       p = kmem_cache_alloc(pool->handle_cachep,
24904 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
24905 +#ifdef CONFIG_PREEMPT_RT_FULL
24906 +       if (p) {
24907 +               struct zsmalloc_handle *zh = p;
24909 +               mutex_init(&zh->lock);
24910 +       }
24911 +#endif
24912 +       return (unsigned long)p;
24915 +#ifdef CONFIG_PREEMPT_RT_FULL
24916 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
24918 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
24920 +#endif
24922  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
24924         kmem_cache_free(pool->handle_cachep, (void *)handle);
24925 @@ -373,12 +404,18 @@
24927  static void record_obj(unsigned long handle, unsigned long obj)
24929 +#ifdef CONFIG_PREEMPT_RT_FULL
24930 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24932 +       WRITE_ONCE(zh->addr, obj);
24933 +#else
24934         /*
24935          * lsb of @obj represents handle lock while other bits
24936          * represent object value the handle is pointing so
24937          * updating shouldn't do store tearing.
24938          */
24939         WRITE_ONCE(*(unsigned long *)handle, obj);
24940 +#endif
24943  /* zpool driver */
24944 @@ -467,6 +504,7 @@
24946  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
24947  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
24948 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
24950  static bool is_zspage_isolated(struct zspage *zspage)
24952 @@ -902,7 +940,13 @@
24954  static unsigned long handle_to_obj(unsigned long handle)
24956 +#ifdef CONFIG_PREEMPT_RT_FULL
24957 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24959 +       return zh->addr;
24960 +#else
24961         return *(unsigned long *)handle;
24962 +#endif
24965  static unsigned long obj_to_head(struct page *page, void *obj)
24966 @@ -916,22 +960,46 @@
24968  static inline int testpin_tag(unsigned long handle)
24970 +#ifdef CONFIG_PREEMPT_RT_FULL
24971 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24973 +       return mutex_is_locked(&zh->lock);
24974 +#else
24975         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
24976 +#endif
24979  static inline int trypin_tag(unsigned long handle)
24981 +#ifdef CONFIG_PREEMPT_RT_FULL
24982 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24984 +       return mutex_trylock(&zh->lock);
24985 +#else
24986         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
24987 +#endif
24990  static void pin_tag(unsigned long handle)
24992 +#ifdef CONFIG_PREEMPT_RT_FULL
24993 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24995 +       return mutex_lock(&zh->lock);
24996 +#else
24997         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
24998 +#endif
25001  static void unpin_tag(unsigned long handle)
25003 +#ifdef CONFIG_PREEMPT_RT_FULL
25004 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25006 +       return mutex_unlock(&zh->lock);
25007 +#else
25008         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
25009 +#endif
25012  static void reset_page(struct page *page)
25013 @@ -1423,7 +1491,7 @@
25014         class = pool->size_class[class_idx];
25015         off = (class->size * obj_idx) & ~PAGE_MASK;
25017 -       area = &get_cpu_var(zs_map_area);
25018 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
25019         area->vm_mm = mm;
25020         if (off + class->size <= PAGE_SIZE) {
25021                 /* this object is contained entirely within a page */
25022 @@ -1477,7 +1545,7 @@
25024                 __zs_unmap_object(area, pages, off, class->size);
25025         }
25026 -       put_cpu_var(zs_map_area);
25027 +       put_locked_var(zs_map_area_lock, zs_map_area);
25029         migrate_read_unlock(zspage);
25030         unpin_tag(handle);
25031 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/dev.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/dev.c
25032 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/dev.c  2017-04-16 10:38:31.000000000 +0200
25033 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/dev.c       2017-04-18 17:54:26.000000000 +0200
25034 @@ -190,6 +190,7 @@
25035  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
25037  static seqcount_t devnet_rename_seq;
25038 +static DEFINE_MUTEX(devnet_rename_mutex);
25040  static inline void dev_base_seq_inc(struct net *net)
25042 @@ -211,14 +212,14 @@
25043  static inline void rps_lock(struct softnet_data *sd)
25045  #ifdef CONFIG_RPS
25046 -       spin_lock(&sd->input_pkt_queue.lock);
25047 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
25048  #endif
25051  static inline void rps_unlock(struct softnet_data *sd)
25053  #ifdef CONFIG_RPS
25054 -       spin_unlock(&sd->input_pkt_queue.lock);
25055 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
25056  #endif
25059 @@ -888,7 +889,8 @@
25060         strcpy(name, dev->name);
25061         rcu_read_unlock();
25062         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
25063 -               cond_resched();
25064 +               mutex_lock(&devnet_rename_mutex);
25065 +               mutex_unlock(&devnet_rename_mutex);
25066                 goto retry;
25067         }
25069 @@ -1157,20 +1159,17 @@
25070         if (dev->flags & IFF_UP)
25071                 return -EBUSY;
25073 -       write_seqcount_begin(&devnet_rename_seq);
25074 +       mutex_lock(&devnet_rename_mutex);
25075 +       __raw_write_seqcount_begin(&devnet_rename_seq);
25077 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
25078 -               write_seqcount_end(&devnet_rename_seq);
25079 -               return 0;
25080 -       }
25081 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
25082 +               goto outunlock;
25084         memcpy(oldname, dev->name, IFNAMSIZ);
25086         err = dev_get_valid_name(net, dev, newname);
25087 -       if (err < 0) {
25088 -               write_seqcount_end(&devnet_rename_seq);
25089 -               return err;
25090 -       }
25091 +       if (err < 0)
25092 +               goto outunlock;
25094         if (oldname[0] && !strchr(oldname, '%'))
25095                 netdev_info(dev, "renamed from %s\n", oldname);
25096 @@ -1183,11 +1182,12 @@
25097         if (ret) {
25098                 memcpy(dev->name, oldname, IFNAMSIZ);
25099                 dev->name_assign_type = old_assign_type;
25100 -               write_seqcount_end(&devnet_rename_seq);
25101 -               return ret;
25102 +               err = ret;
25103 +               goto outunlock;
25104         }
25106 -       write_seqcount_end(&devnet_rename_seq);
25107 +       __raw_write_seqcount_end(&devnet_rename_seq);
25108 +       mutex_unlock(&devnet_rename_mutex);
25110         netdev_adjacent_rename_links(dev, oldname);
25112 @@ -1208,7 +1208,8 @@
25113                 /* err >= 0 after dev_alloc_name() or stores the first errno */
25114                 if (err >= 0) {
25115                         err = ret;
25116 -                       write_seqcount_begin(&devnet_rename_seq);
25117 +                       mutex_lock(&devnet_rename_mutex);
25118 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
25119                         memcpy(dev->name, oldname, IFNAMSIZ);
25120                         memcpy(oldname, newname, IFNAMSIZ);
25121                         dev->name_assign_type = old_assign_type;
25122 @@ -1221,6 +1222,11 @@
25123         }
25125         return err;
25127 +outunlock:
25128 +       __raw_write_seqcount_end(&devnet_rename_seq);
25129 +       mutex_unlock(&devnet_rename_mutex);
25130 +       return err;
25133  /**
25134 @@ -2285,6 +2291,7 @@
25135         sd->output_queue_tailp = &q->next_sched;
25136         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25137         local_irq_restore(flags);
25138 +       preempt_check_resched_rt();
25141  void __netif_schedule(struct Qdisc *q)
25142 @@ -2366,6 +2373,7 @@
25143         __this_cpu_write(softnet_data.completion_queue, skb);
25144         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25145         local_irq_restore(flags);
25146 +       preempt_check_resched_rt();
25148  EXPORT_SYMBOL(__dev_kfree_skb_irq);
25150 @@ -3100,7 +3108,11 @@
25151          * This permits qdisc->running owner to get the lock more
25152          * often and dequeue packets faster.
25153          */
25154 +#ifdef CONFIG_PREEMPT_RT_FULL
25155 +       contended = true;
25156 +#else
25157         contended = qdisc_is_running(q);
25158 +#endif
25159         if (unlikely(contended))
25160                 spin_lock(&q->busylock);
25162 @@ -3163,8 +3175,10 @@
25163  #define skb_update_prio(skb)
25164  #endif
25166 +#ifndef CONFIG_PREEMPT_RT_FULL
25167  DEFINE_PER_CPU(int, xmit_recursion);
25168  EXPORT_SYMBOL(xmit_recursion);
25169 +#endif
25171  /**
25172   *     dev_loopback_xmit - loop back @skb
25173 @@ -3398,8 +3412,7 @@
25174                 int cpu = smp_processor_id(); /* ok because BHs are off */
25176                 if (txq->xmit_lock_owner != cpu) {
25177 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
25178 -                                    XMIT_RECURSION_LIMIT))
25179 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
25180                                 goto recursion_alert;
25182                         skb = validate_xmit_skb(skb, dev);
25183 @@ -3409,9 +3422,9 @@
25184                         HARD_TX_LOCK(dev, txq, cpu);
25186                         if (!netif_xmit_stopped(txq)) {
25187 -                               __this_cpu_inc(xmit_recursion);
25188 +                               xmit_rec_inc();
25189                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
25190 -                               __this_cpu_dec(xmit_recursion);
25191 +                               xmit_rec_dec();
25192                                 if (dev_xmit_complete(rc)) {
25193                                         HARD_TX_UNLOCK(dev, txq);
25194                                         goto out;
25195 @@ -3785,6 +3798,7 @@
25196         rps_unlock(sd);
25198         local_irq_restore(flags);
25199 +       preempt_check_resched_rt();
25201         atomic_long_inc(&skb->dev->rx_dropped);
25202         kfree_skb(skb);
25203 @@ -3803,7 +3817,7 @@
25204                 struct rps_dev_flow voidflow, *rflow = &voidflow;
25205                 int cpu;
25207 -               preempt_disable();
25208 +               migrate_disable();
25209                 rcu_read_lock();
25211                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
25212 @@ -3813,13 +3827,13 @@
25213                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
25215                 rcu_read_unlock();
25216 -               preempt_enable();
25217 +               migrate_enable();
25218         } else
25219  #endif
25220         {
25221                 unsigned int qtail;
25222 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
25223 -               put_cpu();
25224 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
25225 +               put_cpu_light();
25226         }
25227         return ret;
25229 @@ -3853,11 +3867,9 @@
25231         trace_netif_rx_ni_entry(skb);
25233 -       preempt_disable();
25234 +       local_bh_disable();
25235         err = netif_rx_internal(skb);
25236 -       if (local_softirq_pending())
25237 -               do_softirq();
25238 -       preempt_enable();
25239 +       local_bh_enable();
25241         return err;
25243 @@ -4336,7 +4348,7 @@
25244         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
25245                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
25246                         __skb_unlink(skb, &sd->input_pkt_queue);
25247 -                       kfree_skb(skb);
25248 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25249                         input_queue_head_incr(sd);
25250                 }
25251         }
25252 @@ -4346,11 +4358,14 @@
25253         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
25254                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
25255                         __skb_unlink(skb, &sd->process_queue);
25256 -                       kfree_skb(skb);
25257 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25258                         input_queue_head_incr(sd);
25259                 }
25260         }
25261 +       if (!skb_queue_empty(&sd->tofree_queue))
25262 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
25263         local_bh_enable();
25267  static void flush_all_backlogs(void)
25268 @@ -4831,6 +4846,7 @@
25269                 sd->rps_ipi_list = NULL;
25271                 local_irq_enable();
25272 +               preempt_check_resched_rt();
25274                 /* Send pending IPI's to kick RPS processing on remote cpus. */
25275                 while (remsd) {
25276 @@ -4844,6 +4860,7 @@
25277         } else
25278  #endif
25279                 local_irq_enable();
25280 +       preempt_check_resched_rt();
25283  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
25284 @@ -4873,7 +4890,9 @@
25285         while (again) {
25286                 struct sk_buff *skb;
25288 +               local_irq_disable();
25289                 while ((skb = __skb_dequeue(&sd->process_queue))) {
25290 +                       local_irq_enable();
25291                         rcu_read_lock();
25292                         __netif_receive_skb(skb);
25293                         rcu_read_unlock();
25294 @@ -4881,9 +4900,9 @@
25295                         if (++work >= quota)
25296                                 return work;
25298 +                       local_irq_disable();
25299                 }
25301 -               local_irq_disable();
25302                 rps_lock(sd);
25303                 if (skb_queue_empty(&sd->input_pkt_queue)) {
25304                         /*
25305 @@ -4921,9 +4940,11 @@
25306         local_irq_save(flags);
25307         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
25308         local_irq_restore(flags);
25309 +       preempt_check_resched_rt();
25311  EXPORT_SYMBOL(__napi_schedule);
25313 +#ifndef CONFIG_PREEMPT_RT_FULL
25314  /**
25315   * __napi_schedule_irqoff - schedule for receive
25316   * @n: entry to schedule
25317 @@ -4935,6 +4956,7 @@
25318         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
25320  EXPORT_SYMBOL(__napi_schedule_irqoff);
25321 +#endif
25323  void __napi_complete(struct napi_struct *n)
25325 @@ -5224,13 +5246,21 @@
25326         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
25327         unsigned long time_limit = jiffies + 2;
25328         int budget = netdev_budget;
25329 +       struct sk_buff_head tofree_q;
25330 +       struct sk_buff *skb;
25331         LIST_HEAD(list);
25332         LIST_HEAD(repoll);
25334 +       __skb_queue_head_init(&tofree_q);
25336         local_irq_disable();
25337 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
25338         list_splice_init(&sd->poll_list, &list);
25339         local_irq_enable();
25341 +       while ((skb = __skb_dequeue(&tofree_q)))
25342 +               kfree_skb(skb);
25344         for (;;) {
25345                 struct napi_struct *n;
25347 @@ -5261,7 +5291,7 @@
25348         list_splice_tail(&repoll, &list);
25349         list_splice(&list, &sd->poll_list);
25350         if (!list_empty(&sd->poll_list))
25351 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
25352 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
25354         net_rps_action_and_irq_enable(sd);
25356 @@ -8022,16 +8052,20 @@
25358         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25359         local_irq_enable();
25360 +       preempt_check_resched_rt();
25362         /* Process offline CPU's input_pkt_queue */
25363         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
25364                 netif_rx_ni(skb);
25365                 input_queue_head_incr(oldsd);
25366         }
25367 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
25368 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
25369                 netif_rx_ni(skb);
25370                 input_queue_head_incr(oldsd);
25371         }
25372 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
25373 +               kfree_skb(skb);
25374 +       }
25376         return NOTIFY_OK;
25378 @@ -8336,8 +8370,9 @@
25380                 INIT_WORK(flush, flush_backlog);
25382 -               skb_queue_head_init(&sd->input_pkt_queue);
25383 -               skb_queue_head_init(&sd->process_queue);
25384 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
25385 +               skb_queue_head_init_raw(&sd->process_queue);
25386 +               skb_queue_head_init_raw(&sd->tofree_queue);
25387                 INIT_LIST_HEAD(&sd->poll_list);
25388                 sd->output_queue_tailp = &sd->output_queue;
25389  #ifdef CONFIG_RPS
25390 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/filter.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/filter.c
25391 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/filter.c       2017-04-16 10:38:31.000000000 +0200
25392 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/filter.c    2017-04-18 17:54:26.000000000 +0200
25393 @@ -1645,7 +1645,7 @@
25395         int ret;
25397 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
25398 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
25399                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
25400                 kfree_skb(skb);
25401                 return -ENETDOWN;
25402 @@ -1653,9 +1653,9 @@
25404         skb->dev = dev;
25406 -       __this_cpu_inc(xmit_recursion);
25407 +       xmit_rec_inc();
25408         ret = dev_queue_xmit(skb);
25409 -       __this_cpu_dec(xmit_recursion);
25410 +       xmit_rec_dec();
25412         return ret;
25414 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/gen_estimator.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/gen_estimator.c
25415 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/gen_estimator.c        2017-04-16 10:38:31.000000000 +0200
25416 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/gen_estimator.c     2017-04-18 17:54:26.000000000 +0200
25417 @@ -84,7 +84,7 @@
25418         struct gnet_stats_basic_packed  *bstats;
25419         struct gnet_stats_rate_est64    *rate_est;
25420         spinlock_t              *stats_lock;
25421 -       seqcount_t              *running;
25422 +       net_seqlock_t           *running;
25423         int                     ewma_log;
25424         u32                     last_packets;
25425         unsigned long           avpps;
25426 @@ -213,7 +213,7 @@
25427                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25428                       struct gnet_stats_rate_est64 *rate_est,
25429                       spinlock_t *stats_lock,
25430 -                     seqcount_t *running,
25431 +                     net_seqlock_t *running,
25432                       struct nlattr *opt)
25434         struct gen_estimator *est;
25435 @@ -309,7 +309,7 @@
25436                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25437                           struct gnet_stats_rate_est64 *rate_est,
25438                           spinlock_t *stats_lock,
25439 -                         seqcount_t *running, struct nlattr *opt)
25440 +                         net_seqlock_t *running, struct nlattr *opt)
25442         gen_kill_estimator(bstats, rate_est);
25443         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
25444 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/gen_stats.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/gen_stats.c
25445 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/gen_stats.c    2017-04-16 10:38:31.000000000 +0200
25446 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/gen_stats.c 2017-04-18 17:54:26.000000000 +0200
25447 @@ -130,7 +130,7 @@
25450  void
25451 -__gnet_stats_copy_basic(const seqcount_t *running,
25452 +__gnet_stats_copy_basic(net_seqlock_t *running,
25453                         struct gnet_stats_basic_packed *bstats,
25454                         struct gnet_stats_basic_cpu __percpu *cpu,
25455                         struct gnet_stats_basic_packed *b)
25456 @@ -143,10 +143,10 @@
25457         }
25458         do {
25459                 if (running)
25460 -                       seq = read_seqcount_begin(running);
25461 +                       seq = net_seq_begin(running);
25462                 bstats->bytes = b->bytes;
25463                 bstats->packets = b->packets;
25464 -       } while (running && read_seqcount_retry(running, seq));
25465 +       } while (running && net_seq_retry(running, seq));
25467  EXPORT_SYMBOL(__gnet_stats_copy_basic);
25469 @@ -164,7 +164,7 @@
25470   * if the room in the socket buffer was not sufficient.
25471   */
25472  int
25473 -gnet_stats_copy_basic(const seqcount_t *running,
25474 +gnet_stats_copy_basic(net_seqlock_t *running,
25475                       struct gnet_dump *d,
25476                       struct gnet_stats_basic_cpu __percpu *cpu,
25477                       struct gnet_stats_basic_packed *b)
25478 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/skbuff.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/skbuff.c
25479 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/skbuff.c       2017-04-16 10:38:31.000000000 +0200
25480 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/skbuff.c    2017-04-18 17:54:26.000000000 +0200
25481 @@ -64,6 +64,7 @@
25482  #include <linux/errqueue.h>
25483  #include <linux/prefetch.h>
25484  #include <linux/if_vlan.h>
25485 +#include <linux/locallock.h>
25487  #include <net/protocol.h>
25488  #include <net/dst.h>
25489 @@ -360,6 +361,8 @@
25491  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
25492  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
25493 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
25494 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
25496  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25498 @@ -367,10 +370,10 @@
25499         unsigned long flags;
25500         void *data;
25502 -       local_irq_save(flags);
25503 +       local_lock_irqsave(netdev_alloc_lock, flags);
25504         nc = this_cpu_ptr(&netdev_alloc_cache);
25505         data = __alloc_page_frag(nc, fragsz, gfp_mask);
25506 -       local_irq_restore(flags);
25507 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25508         return data;
25511 @@ -389,9 +392,13 @@
25513  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25515 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25516 +       struct napi_alloc_cache *nc;
25517 +       void *data;
25519 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25520 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25521 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25522 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25523 +       return data;
25526  void *napi_alloc_frag(unsigned int fragsz)
25527 @@ -438,13 +445,13 @@
25528         if (sk_memalloc_socks())
25529                 gfp_mask |= __GFP_MEMALLOC;
25531 -       local_irq_save(flags);
25532 +       local_lock_irqsave(netdev_alloc_lock, flags);
25534         nc = this_cpu_ptr(&netdev_alloc_cache);
25535         data = __alloc_page_frag(nc, len, gfp_mask);
25536         pfmemalloc = nc->pfmemalloc;
25538 -       local_irq_restore(flags);
25539 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25541         if (unlikely(!data))
25542                 return NULL;
25543 @@ -485,9 +492,10 @@
25544  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25545                                  gfp_t gfp_mask)
25547 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25548 +       struct napi_alloc_cache *nc;
25549         struct sk_buff *skb;
25550         void *data;
25551 +       bool pfmemalloc;
25553         len += NET_SKB_PAD + NET_IP_ALIGN;
25555 @@ -505,7 +513,10 @@
25556         if (sk_memalloc_socks())
25557                 gfp_mask |= __GFP_MEMALLOC;
25559 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25560         data = __alloc_page_frag(&nc->page, len, gfp_mask);
25561 +       pfmemalloc = nc->page.pfmemalloc;
25562 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25563         if (unlikely(!data))
25564                 return NULL;
25566 @@ -516,7 +527,7 @@
25567         }
25569         /* use OR instead of assignment to avoid clearing of bits in mask */
25570 -       if (nc->page.pfmemalloc)
25571 +       if (pfmemalloc)
25572                 skb->pfmemalloc = 1;
25573         skb->head_frag = 1;
25575 @@ -760,23 +771,26 @@
25577  void __kfree_skb_flush(void)
25579 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25580 +       struct napi_alloc_cache *nc;
25582 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25583         /* flush skb_cache if containing objects */
25584         if (nc->skb_count) {
25585                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
25586                                      nc->skb_cache);
25587                 nc->skb_count = 0;
25588         }
25589 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25592  static inline void _kfree_skb_defer(struct sk_buff *skb)
25594 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25595 +       struct napi_alloc_cache *nc;
25597         /* drop skb->head and call any destructors for packet */
25598         skb_release_all(skb);
25600 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25601         /* record skb to CPU local list */
25602         nc->skb_cache[nc->skb_count++] = skb;
25604 @@ -791,6 +805,7 @@
25605                                      nc->skb_cache);
25606                 nc->skb_count = 0;
25607         }
25608 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25610  void __kfree_skb_defer(struct sk_buff *skb)
25612 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/sock.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/sock.c
25613 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/core/sock.c 2017-04-16 10:38:31.000000000 +0200
25614 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/core/sock.c      2017-04-18 17:54:26.000000000 +0200
25615 @@ -2499,12 +2499,11 @@
25616         if (sk->sk_lock.owned)
25617                 __lock_sock(sk);
25618         sk->sk_lock.owned = 1;
25619 -       spin_unlock(&sk->sk_lock.slock);
25620 +       spin_unlock_bh(&sk->sk_lock.slock);
25621         /*
25622          * The sk_lock has mutex_lock() semantics here:
25623          */
25624         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
25625 -       local_bh_enable();
25627  EXPORT_SYMBOL(lock_sock_nested);
25629 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/icmp.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/icmp.c
25630 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/icmp.c 2017-04-16 10:38:31.000000000 +0200
25631 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/icmp.c      2017-04-18 17:54:26.000000000 +0200
25632 @@ -69,6 +69,7 @@
25633  #include <linux/jiffies.h>
25634  #include <linux/kernel.h>
25635  #include <linux/fcntl.h>
25636 +#include <linux/sysrq.h>
25637  #include <linux/socket.h>
25638  #include <linux/in.h>
25639  #include <linux/inet.h>
25640 @@ -77,6 +78,7 @@
25641  #include <linux/string.h>
25642  #include <linux/netfilter_ipv4.h>
25643  #include <linux/slab.h>
25644 +#include <linux/locallock.h>
25645  #include <net/snmp.h>
25646  #include <net/ip.h>
25647  #include <net/route.h>
25648 @@ -204,6 +206,8 @@
25649   *
25650   *     On SMP we have one ICMP socket per-cpu.
25651   */
25652 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
25654  static struct sock *icmp_sk(struct net *net)
25656         return *this_cpu_ptr(net->ipv4.icmp_sk);
25657 @@ -215,12 +219,14 @@
25659         local_bh_disable();
25661 +       local_lock(icmp_sk_lock);
25662         sk = icmp_sk(net);
25664         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
25665                 /* This can happen if the output path signals a
25666                  * dst_link_failure() for an outgoing ICMP packet.
25667                  */
25668 +               local_unlock(icmp_sk_lock);
25669                 local_bh_enable();
25670                 return NULL;
25671         }
25672 @@ -230,6 +236,7 @@
25673  static inline void icmp_xmit_unlock(struct sock *sk)
25675         spin_unlock_bh(&sk->sk_lock.slock);
25676 +       local_unlock(icmp_sk_lock);
25679  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
25680 @@ -358,6 +365,7 @@
25681         struct sock *sk;
25682         struct sk_buff *skb;
25684 +       local_lock(icmp_sk_lock);
25685         sk = icmp_sk(dev_net((*rt)->dst.dev));
25686         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
25687                            icmp_param->data_len+icmp_param->head_len,
25688 @@ -380,6 +388,7 @@
25689                 skb->ip_summed = CHECKSUM_NONE;
25690                 ip_push_pending_frames(sk, fl4);
25691         }
25692 +       local_unlock(icmp_sk_lock);
25695  /*
25696 @@ -891,6 +900,30 @@
25699  /*
25700 + * 32-bit and 64-bit have different timestamp lengths, so we check for
25701 + * the cookie at offset 20 and verify it is repeated at offset 50
25702 + */
25703 +#define CO_POS0                20
25704 +#define CO_POS1                50
25705 +#define CO_SIZE                sizeof(int)
25706 +#define ICMP_SYSRQ_SIZE        57
25709 + * We got an ICMP_SYSRQ_SIZE-sized ping request. Check for the cookie
25710 + * pattern and, if it matches, send the next byte as a trigger to sysrq.
25711 + */
25712 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
25714 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
25715 +       char *p = skb->data;
25717 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
25718 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
25719 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
25720 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
25724   *     Handle ICMP_ECHO ("ping") requests.
25725   *
25726   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
25727 @@ -917,6 +950,11 @@
25728                 icmp_param.data_len        = skb->len;
25729                 icmp_param.head_len        = sizeof(struct icmphdr);
25730                 icmp_reply(&icmp_param, skb);
25732 +               if (skb->len == ICMP_SYSRQ_SIZE &&
25733 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
25734 +                       icmp_check_sysrq(net, skb);
25735 +               }
25736         }
25737         /* should there be an ICMP stat for ignored echos? */
25738         return true;
25739 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/sysctl_net_ipv4.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/sysctl_net_ipv4.c
25740 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/sysctl_net_ipv4.c      2017-04-16 10:38:31.000000000 +0200
25741 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/sysctl_net_ipv4.c   2017-04-18 17:54:27.000000000 +0200
25742 @@ -681,6 +681,13 @@
25743                 .proc_handler   = proc_dointvec
25744         },
25745         {
25746 +               .procname       = "icmp_echo_sysrq",
25747 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
25748 +               .maxlen         = sizeof(int),
25749 +               .mode           = 0644,
25750 +               .proc_handler   = proc_dointvec
25751 +       },
25752 +       {
25753                 .procname       = "icmp_ignore_bogus_error_responses",
25754                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
25755                 .maxlen         = sizeof(int),
25756 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/tcp_ipv4.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/tcp_ipv4.c
25757 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/ipv4/tcp_ipv4.c     2017-04-16 10:38:31.000000000 +0200
25758 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/ipv4/tcp_ipv4.c  2017-04-18 17:54:27.000000000 +0200
25759 @@ -62,6 +62,7 @@
25760  #include <linux/init.h>
25761  #include <linux/times.h>
25762  #include <linux/slab.h>
25763 +#include <linux/locallock.h>
25765  #include <net/net_namespace.h>
25766  #include <net/icmp.h>
25767 @@ -568,6 +569,7 @@
25769  EXPORT_SYMBOL(tcp_v4_send_check);
25771 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
25772  /*
25773   *     This routine will send an RST to the other tcp.
25774   *
25775 @@ -695,6 +697,8 @@
25776                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
25778         arg.tos = ip_hdr(skb)->tos;
25780 +       local_lock(tcp_sk_lock);
25781         local_bh_disable();
25782         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
25783                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
25784 @@ -704,6 +708,7 @@
25785         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
25786         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
25787         local_bh_enable();
25788 +       local_unlock(tcp_sk_lock);
25790  #ifdef CONFIG_TCP_MD5SIG
25791  out:
25792 @@ -779,6 +784,7 @@
25793         if (oif)
25794                 arg.bound_dev_if = oif;
25795         arg.tos = tos;
25796 +       local_lock(tcp_sk_lock);
25797         local_bh_disable();
25798         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
25799                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
25800 @@ -787,6 +793,7 @@
25802         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
25803         local_bh_enable();
25804 +       local_unlock(tcp_sk_lock);
25807  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
25808 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/mac80211/rx.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/mac80211/rx.c
25809 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/mac80211/rx.c       2017-04-16 10:38:32.000000000 +0200
25810 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/mac80211/rx.c    2017-04-18 17:54:27.000000000 +0200
25811 @@ -4180,7 +4180,7 @@
25812         struct ieee80211_supported_band *sband;
25813         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
25815 -       WARN_ON_ONCE(softirq_count() == 0);
25816 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
25818         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
25819                 goto drop;
25820 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/netfilter/core.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/netfilter/core.c
25821 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/netfilter/core.c    2017-04-16 10:38:32.000000000 +0200
25822 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/netfilter/core.c 2017-04-18 17:54:27.000000000 +0200
25823 @@ -22,12 +22,18 @@
25824  #include <linux/proc_fs.h>
25825  #include <linux/mutex.h>
25826  #include <linux/slab.h>
25827 +#include <linux/locallock.h>
25828  #include <linux/rcupdate.h>
25829  #include <net/net_namespace.h>
25830  #include <net/sock.h>
25832  #include "nf_internals.h"
25834 +#ifdef CONFIG_PREEMPT_RT_BASE
25835 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
25836 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
25837 +#endif
25839  static DEFINE_MUTEX(afinfo_mutex);
25841  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
25842 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/packet/af_packet.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/packet/af_packet.c
25843 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/packet/af_packet.c  2017-04-16 10:38:33.000000000 +0200
25844 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/packet/af_packet.c       2017-04-18 17:54:27.000000000 +0200
25845 @@ -63,6 +63,7 @@
25846  #include <linux/if_packet.h>
25847  #include <linux/wireless.h>
25848  #include <linux/kernel.h>
25849 +#include <linux/delay.h>
25850  #include <linux/kmod.h>
25851  #include <linux/slab.h>
25852  #include <linux/vmalloc.h>
25853 @@ -694,7 +695,7 @@
25854         if (BLOCK_NUM_PKTS(pbd)) {
25855                 while (atomic_read(&pkc->blk_fill_in_prog)) {
25856                         /* Waiting for skb_copy_bits to finish... */
25857 -                       cpu_relax();
25858 +                       cpu_chill();
25859                 }
25860         }
25862 @@ -956,7 +957,7 @@
25863                 if (!(status & TP_STATUS_BLK_TMO)) {
25864                         while (atomic_read(&pkc->blk_fill_in_prog)) {
25865                                 /* Waiting for skb_copy_bits to finish... */
25866 -                               cpu_relax();
25867 +                               cpu_chill();
25868                         }
25869                 }
25870                 prb_close_block(pkc, pbd, po, status);
25871 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/rds/ib_rdma.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/rds/ib_rdma.c
25872 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/rds/ib_rdma.c       2017-04-16 10:38:33.000000000 +0200
25873 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/rds/ib_rdma.c    2017-04-18 17:54:27.000000000 +0200
25874 @@ -34,6 +34,7 @@
25875  #include <linux/slab.h>
25876  #include <linux/rculist.h>
25877  #include <linux/llist.h>
25878 +#include <linux/delay.h>
25880  #include "rds_single_path.h"
25881  #include "ib_mr.h"
25882 @@ -210,7 +211,7 @@
25883         for_each_online_cpu(cpu) {
25884                 flag = &per_cpu(clean_list_grace, cpu);
25885                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
25886 -                       cpu_relax();
25887 +                       cpu_chill();
25888         }
25891 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/rxrpc/security.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/rxrpc/security.c
25892 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/rxrpc/security.c    2017-04-16 10:38:33.000000000 +0200
25893 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/rxrpc/security.c 2017-04-18 17:54:27.000000000 +0200
25894 @@ -19,9 +19,6 @@
25895  #include <keys/rxrpc-type.h>
25896  #include "ar-internal.h"
25898 -static LIST_HEAD(rxrpc_security_methods);
25899 -static DECLARE_RWSEM(rxrpc_security_sem);
25901  static const struct rxrpc_security *rxrpc_security_types[] = {
25902         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
25903  #ifdef CONFIG_RXKAD
25904 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sched/sch_api.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sched/sch_api.c
25905 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sched/sch_api.c     2017-04-16 10:38:33.000000000 +0200
25906 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sched/sch_api.c  2017-04-18 17:54:27.000000000 +0200
25907 @@ -981,7 +981,7 @@
25908                         rcu_assign_pointer(sch->stab, stab);
25909                 }
25910                 if (tca[TCA_RATE]) {
25911 -                       seqcount_t *running;
25912 +                       net_seqlock_t *running;
25914                         err = -EOPNOTSUPP;
25915                         if (sch->flags & TCQ_F_MQROOT)
25916 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sched/sch_generic.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sched/sch_generic.c
25917 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sched/sch_generic.c 2017-04-16 10:38:33.000000000 +0200
25918 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sched/sch_generic.c      2017-04-18 17:54:27.000000000 +0200
25919 @@ -425,7 +425,11 @@
25920         .ops            =       &noop_qdisc_ops,
25921         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
25922         .dev_queue      =       &noop_netdev_queue,
25923 +#ifdef CONFIG_PREEMPT_RT_BASE
25924 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
25925 +#else
25926         .running        =       SEQCNT_ZERO(noop_qdisc.running),
25927 +#endif
25928         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
25929  };
25930  EXPORT_SYMBOL(noop_qdisc);
25931 @@ -624,9 +628,17 @@
25932         lockdep_set_class(&sch->busylock,
25933                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
25935 +#ifdef CONFIG_PREEMPT_RT_BASE
25936 +       seqlock_init(&sch->running);
25937 +       lockdep_set_class(&sch->running.seqcount,
25938 +                         dev->qdisc_running_key ?: &qdisc_running_key);
25939 +       lockdep_set_class(&sch->running.lock,
25940 +                         dev->qdisc_running_key ?: &qdisc_running_key);
25941 +#else
25942         seqcount_init(&sch->running);
25943         lockdep_set_class(&sch->running,
25944                           dev->qdisc_running_key ?: &qdisc_running_key);
25945 +#endif
25947         sch->ops = ops;
25948         sch->enqueue = ops->enqueue;
25949 @@ -925,7 +937,7 @@
25950         /* Wait for outstanding qdisc_run calls. */
25951         list_for_each_entry(dev, head, close_list)
25952                 while (some_qdisc_is_busy(dev))
25953 -                       yield();
25954 +                       msleep(1);
25957  void dev_deactivate(struct net_device *dev)
25958 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sunrpc/svc_xprt.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sunrpc/svc_xprt.c
25959 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/net/sunrpc/svc_xprt.c   2017-04-16 10:38:34.000000000 +0200
25960 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/net/sunrpc/svc_xprt.c        2017-04-18 17:54:27.000000000 +0200
25961 @@ -396,7 +396,7 @@
25962                 goto out;
25963         }
25965 -       cpu = get_cpu();
25966 +       cpu = get_cpu_light();
25967         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
25969         atomic_long_inc(&pool->sp_stats.packets);
25970 @@ -432,7 +432,7 @@
25972                 atomic_long_inc(&pool->sp_stats.threads_woken);
25973                 wake_up_process(rqstp->rq_task);
25974 -               put_cpu();
25975 +               put_cpu_light();
25976                 goto out;
25977         }
25978         rcu_read_unlock();
25979 @@ -453,7 +453,7 @@
25980                 goto redo_search;
25981         }
25982         rqstp = NULL;
25983 -       put_cpu();
25984 +       put_cpu_light();
25985  out:
25986         trace_svc_xprt_do_enqueue(xprt, rqstp);
25988 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/scripts/mkcompile_h linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/scripts/mkcompile_h
25989 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/scripts/mkcompile_h     2017-04-16 10:38:34.000000000 +0200
25990 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/scripts/mkcompile_h  2017-04-18 17:54:27.000000000 +0200
25991 @@ -4,7 +4,8 @@
25992  ARCH=$2
25993  SMP=$3
25994  PREEMPT=$4
25995 -CC=$5
25996 +RT=$5
25997 +CC=$6
25999  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
26001 @@ -57,6 +58,7 @@
26002  CONFIG_FLAGS=""
26003  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
26004  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
26005 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
26006  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
26008  # Truncate to maximum length
26009 diff -Nur linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/sound/core/pcm_native.c linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/sound/core/pcm_native.c
26010 --- linux-72134397d72079a533c8fc742701fdc7f5ae7c5b.orig/sound/core/pcm_native.c 2017-04-16 10:38:35.000000000 +0200
26011 +++ linux-72134397d72079a533c8fc742701fdc7f5ae7c5b/sound/core/pcm_native.c      2017-04-18 17:54:27.000000000 +0200
26012 @@ -135,7 +135,7 @@
26013  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
26015         if (!substream->pcm->nonatomic)
26016 -               local_irq_disable();
26017 +               local_irq_disable_nort();
26018         snd_pcm_stream_lock(substream);
26020  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
26021 @@ -150,7 +150,7 @@
26023         snd_pcm_stream_unlock(substream);
26024         if (!substream->pcm->nonatomic)
26025 -               local_irq_enable();
26026 +               local_irq_enable_nort();
26028  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
26030 @@ -158,7 +158,7 @@
26032         unsigned long flags = 0;
26033         if (!substream->pcm->nonatomic)
26034 -               local_irq_save(flags);
26035 +               local_irq_save_nort(flags);
26036         snd_pcm_stream_lock(substream);
26037         return flags;
26039 @@ -176,7 +176,7 @@
26041         snd_pcm_stream_unlock(substream);
26042         if (!substream->pcm->nonatomic)
26043 -               local_irq_restore(flags);
26044 +               local_irq_restore_nort(flags);
26046  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);