diff -Nur linux-4.9.6.orig/arch/arm/include/asm/irq.h linux-4.9.6/arch/arm/include/asm/irq.h
--- linux-4.9.6.orig/arch/arm/include/asm/irq.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/irq.h 2017-01-28 13:59:09.887654498 +0100
+#include <linux/cpumask.h>
 extern void migrate_irqs(void);
diff -Nur linux-4.9.6.orig/arch/arm/include/asm/switch_to.h linux-4.9.6/arch/arm/include/asm/switch_to.h
--- linux-4.9.6.orig/arch/arm/include/asm/switch_to.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/switch_to.h 2017-01-28 13:59:09.887654498 +0100
 #include <linux/thread_info.h>
+#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
 * For v7 SMP cores running a preemptible kernel we may be pre-empted
 * during a TLB maintenance operation, so execute an inner-shareable dsb
 #define switch_to(prev,next,last) \
 __complete_pending_tlbi(); \
+ switch_kmaps(prev, next); \
 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
diff -Nur linux-4.9.6.orig/arch/arm/include/asm/thread_info.h linux-4.9.6/arch/arm/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/arm/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/thread_info.h 2017-01-28 13:59:09.887654498 +0100
 unsigned long flags; /* low level flags */
 int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
 mm_segment_t addr_limit; /* address limit */
 struct task_struct *task; /* main task structure */
 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
+#define TIF_NEED_RESCHED_LAZY 7
 #define TIF_NOHZ 12 /* in adaptive nohz mode */
 #define TIF_USING_IWMMXT 17
 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_UPROBE (1 << TIF_UPROBE)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 * Change these and you break ASM code in entry-common.S
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_NEED_RESCHED_LAZY)
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
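These thread_info.h hunks are the heart of the lazy-preemption scheme: a second resched flag (TIF_NEED_RESCHED_LAZY, which takes bit 7 and pushes TIF_SECCOMP to bit 8) plus a preempt_lazy_count, so timeslice preemption of SCHED_OTHER tasks can be deferred while RT-class tasks still preempt immediately through plain TIF_NEED_RESCHED. A minimal C sketch of the intended wakeup-side decision; resched_curr_sketch() is an illustrative name, not the patch's API:

/* Sketch only: hard flag for RT tasks, lazy flag for everything else. */
static void resched_curr_sketch(struct rq *rq, bool lazy)
{
	struct task_struct *curr = rq->curr;

	if (!lazy || rt_task(curr)) {
		/* TIF_NEED_RESCHED: honoured at the next irq exit */
		set_tsk_need_resched(curr);
		return;
	}
	/* TIF_NEED_RESCHED_LAZY: honoured only once preempt_lazy_count == 0 */
	set_tsk_thread_flag(curr, TIF_NEED_RESCHED_LAZY);
}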
diff -Nur linux-4.9.6.orig/arch/arm/Kconfig linux-4.9.6/arch/arm/Kconfig
--- linux-4.9.6.orig/arch/arm/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/Kconfig 2017-01-28 13:59:09.887654498 +0100
 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
 select HAVE_ARCH_HARDENED_USERCOPY
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
 select HAVE_ARCH_MMAP_RND_BITS if MMU
 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
 select HAVE_PERF_EVENTS
 select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
 select HAVE_REGS_AND_STACK_ACCESS_API
 select HAVE_SYSCALL_TRACEPOINTS
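Two Kconfig knobs change here: HAVE_ARCH_JUMP_LABEL gains !PREEMPT_RT_BASE, presumably because runtime patching of the branch sites stops all CPUs and is at odds with RT latency guarantees, and HAVE_PREEMPT_LAZY opts ARM into the lazy-preemption code. For reference, what a jump label buys (a runtime-patched branch instead of a load-and-test), using the real static-key API; do_feature() is a hypothetical callee:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(feature_key);

static void do_feature(void) { /* hypothetical */ }

void hot_path(void)
{
	/* compiles to a patchable NOP/branch at this site */
	if (static_branch_unlikely(&feature_key))
		do_feature();
}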
diff -Nur linux-4.9.6.orig/arch/arm/kernel/asm-offsets.c linux-4.9.6/arch/arm/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/arm/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/asm-offsets.c 2017-01-28 13:59:09.887654498 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/arm/kernel/entry-armv.S linux-4.9.6/arch/arm/kernel/entry-armv.S
--- linux-4.9.6.orig/arch/arm/kernel/entry-armv.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/entry-armv.S 2017-01-28 13:59:09.887654498 +0100
@@ -220,11 +220,18 @@
 #ifdef CONFIG_PREEMPT
 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
- ldr r0, [tsk, #TI_FLAGS] @ get flags
 teq r8, #0 @ if preempt count != 0
+ bne 1f @ return from exception
+ ldr r0, [tsk, #TI_FLAGS] @ get flags
+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
+ blne svc_preempt @ preempt!
+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r8, #0 @ if preempt lazy count != 0
 movne r0, #0 @ force flags to 0
- tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
 svc_exit r5, irq = 1 @ return from exception
 1: bl preempt_schedule_irq @ irq en/disable is done inside
 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
 tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r0, #0 @ if preempt lazy count != 0
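Rendered as C, the reworked svc-exit check above does roughly this (a readability sketch of the assembly, not code from the patch):

	if (ti->preempt_count == 0) {
		if (ti->flags & _TIF_NEED_RESCHED)
			svc_preempt();	/* hard resched: always honoured */
		if (ti->preempt_lazy_count == 0 &&
		    (ti->flags & _TIF_NEED_RESCHED_LAZY))
			svc_preempt();	/* lazy resched: only when the lazy
					   count is zero as well */
	}
	/* fall through to svc_exit: return from the exception */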
diff -Nur linux-4.9.6.orig/arch/arm/kernel/entry-common.S linux-4.9.6/arch/arm/kernel/entry-common.S
--- linux-4.9.6.orig/arch/arm/kernel/entry-common.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/entry-common.S 2017-01-28 13:59:09.887654498 +0100
 disable_irq_notrace @ disable interrupts
 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
+ bne fast_work_pending
+ tst r1, #_TIF_SECCOMP
 bne fast_work_pending
 /* perform architecture specific actions before user return */
 str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
 disable_irq_notrace @ disable interrupts
 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
+ tst r1, #_TIF_SECCOMP
 ENDPROC(ret_fast_syscall)
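Splitting the single tst into two is presumably an encoding fix, not a behavioural change: with TIF_SECCOMP moved to bit 8 the combined work mask spans nine bits, and an ARM data-processing immediate is an 8-bit value rotated right by an even count, so the old one-instruction test no longer assembles. A standalone user-space check of that rule:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if v fits ARM's "8-bit value rotated right by 2n" immediate form. */
static bool arm_imm_encodable(uint32_t v)
{
	for (int rot = 0; rot < 32; rot += 2) {
		/* rotate v left by rot; if the result fits 8 bits, it encodes */
		uint32_t r = (v << rot) | (v >> ((32 - rot) & 31));
		if (r <= 0xff)
			return true;
	}
	return false;
}

int main(void)
{
	printf("0x0ff encodable: %d\n", arm_imm_encodable(0x0ff)); /* 1 */
	printf("0x1ff encodable: %d\n", arm_imm_encodable(0x1ff)); /* 0: 9 bits */
	return 0;
}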
diff -Nur linux-4.9.6.orig/arch/arm/kernel/patch.c linux-4.9.6/arch/arm/kernel/patch.c
--- linux-4.9.6.orig/arch/arm/kernel/patch.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/patch.c 2017-01-28 13:59:09.895654805 +0100
-static DEFINE_SPINLOCK(patch_lock);
+static DEFINE_RAW_SPINLOCK(patch_lock);
 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
 __acquires(&patch_lock)
- spin_lock_irqsave(&patch_lock, *flags);
+ raw_spin_lock_irqsave(&patch_lock, *flags);
 __acquire(&patch_lock);
 clear_fixmap(fixmap);
- spin_unlock_irqrestore(&patch_lock, *flags);
+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
 __release(&patch_lock);
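The patch.c conversion is the first of many in this patch: on PREEMPT_RT_FULL a plain spinlock_t becomes a sleeping, rtmutex-based lock, so any lock taken in truly atomic context (here: kernel text patching, which can run with interrupts hard-disabled) must become a raw_spinlock_t, which always spins. The minimal pattern:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static void atomic_path(void)
{
	unsigned long flags;

	/* never sleeps, even on RT - keep the critical section short */
	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... bounded work ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}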
diff -Nur linux-4.9.6.orig/arch/arm/kernel/process.c linux-4.9.6/arch/arm/kernel/process.c
--- linux-4.9.6.orig/arch/arm/kernel/process.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/process.c 2017-01-28 13:59:09.895654805 +0100
+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
+static int __init vectors_user_mapping_init_page(void)
+ unsigned long addr = 0xffff0000;
+ pgd = pgd_offset_k(addr);
+ pud = pud_offset(pgd, addr);
+ pmd = pmd_offset(pud, addr);
+ page = pmd_page(*(pmd));
+ pgtable_page_ctor(page);
+late_initcall(vectors_user_mapping_init_page);
 #ifdef CONFIG_KUSER_HELPERS
 * The vectors page is always readable from user space for the
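The process.c hunk above is truncated in this listing (the local declarations, the end of the comment and the return are elided). Filled out with the obvious page-table walk, the helper presumably reads as follows; a reconstruction, not a quote from the patch:

static int __init vectors_user_mapping_init_page(void)
{
	struct page *page;
	unsigned long addr = 0xffff0000;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset_k(addr);
	pud = pud_offset(pgd, addr);
	pmd = pmd_offset(pud, addr);
	page = pmd_page(*(pmd));

	pgtable_page_ctor(page);	/* init page->ptl so a coredump of the
					   vector page does not trip over it */
	return 0;
}
late_initcall(vectors_user_mapping_init_page);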
diff -Nur linux-4.9.6.orig/arch/arm/kernel/signal.c linux-4.9.6/arch/arm/kernel/signal.c
--- linux-4.9.6.orig/arch/arm/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/signal.c 2017-01-28 13:59:09.895654805 +0100
 trace_hardirqs_off();
- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY))) {
 if (unlikely(!user_mode(regs)))
diff -Nur linux-4.9.6.orig/arch/arm/kernel/smp.c linux-4.9.6/arch/arm/kernel/smp.c
--- linux-4.9.6.orig/arch/arm/kernel/smp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/smp.c 2017-01-28 13:59:09.895654805 +0100
 local_flush_tlb_all();
- clear_tasks_mm_cpumask(cpu);
 pr_err("CPU%u: cpu didn't die\n", cpu);
+ clear_tasks_mm_cpumask(cpu);
 pr_notice("CPU%u: shutdown\n", cpu);
diff -Nur linux-4.9.6.orig/arch/arm/kernel/unwind.c linux-4.9.6/arch/arm/kernel/unwind.c
--- linux-4.9.6.orig/arch/arm/kernel/unwind.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/unwind.c 2017-01-28 13:59:09.895654805 +0100
 static const struct unwind_idx *__origin_unwind_idx;
 extern const struct unwind_idx __stop_unwind_idx[];
-static DEFINE_SPINLOCK(unwind_lock);
+static DEFINE_RAW_SPINLOCK(unwind_lock);
 static LIST_HEAD(unwind_tables);
 /* Convert a prel31 symbol to an absolute address */
 /* module unwind tables */
 struct unwind_table *table;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_for_each_entry(table, &unwind_tables, list) {
 if (addr >= table->begin_addr &&
 addr < table->end_addr) {
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
 pr_debug("%s: idx = %p\n", __func__, idx);
 tab->begin_addr = text_addr;
 tab->end_addr = text_addr + text_size;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_add_tail(&tab->list, &unwind_tables);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_del(&tab->list);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
diff -Nur linux-4.9.6.orig/arch/arm/kvm/arm.c linux-4.9.6/arch/arm/kvm/arm.c
--- linux-4.9.6.orig/arch/arm/kvm/arm.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kvm/arm.c 2017-01-28 13:59:09.895654805 +0100
 * involves poking the GIC, which must be done in a
 * non-preemptible context.
 kvm_pmu_flush_hwstate(vcpu);
 kvm_timer_flush_hwstate(vcpu);
 kvm_vgic_flush_hwstate(vcpu);
 kvm_pmu_sync_hwstate(vcpu);
 kvm_timer_sync_hwstate(vcpu);
 kvm_vgic_sync_hwstate(vcpu);
 kvm_vgic_sync_hwstate(vcpu);
 ret = handle_exit(vcpu, run, ret);
diff -Nur linux-4.9.6.orig/arch/arm/mach-exynos/platsmp.c linux-4.9.6/arch/arm/mach-exynos/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-exynos/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-exynos/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 return (void __iomem *)(S5P_VA_SCU);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void exynos_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 printk(KERN_ERR "cpu1 power enable failed");
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? ret : 0;
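exynos is the first of eight platforms below (hisi, omap2, prima2, qcom, spear, sti, plat-versatile) whose boot_lock gets the same raw conversion: the secondary CPU takes this lock while it is barely alive, before it can schedule and with interrupts off, so a sleeping RT spinlock would be fatal there. The shared shape of the code, sketched; write_pen_release() stands in for each platform's own release mechanism:

static DEFINE_RAW_SPINLOCK(boot_lock);

static void secondary_init_sketch(unsigned int cpu)
{
	/* let the boot CPU know this core is out of the holding pen */
	write_pen_release(-1);

	/* synchronise with the boot thread; must not sleep here */
	raw_spin_lock(&boot_lock);
	raw_spin_unlock(&boot_lock);
}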
diff -Nur linux-4.9.6.orig/arch/arm/mach-hisi/platmcpm.c linux-4.9.6/arch/arm/mach-hisi/platmcpm.c
--- linux-4.9.6.orig/arch/arm/mach-hisi/platmcpm.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-hisi/platmcpm.c 2017-01-28 13:59:09.895654805 +0100
 static void __iomem *sysctrl, *fabric;
 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static u32 fabric_phys_addr;
 * [0]: bootwrapper physical address
 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
 if (hip04_cpu_table[cluster][cpu])
 hip04_cpu_table[cluster][cpu]++;
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
@@ -162,11 +162,11 @@
 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 hip04_cpu_table[cluster][cpu]--;
 if (hip04_cpu_table[cluster][cpu] == 1) {
 /* A power_up request went ahead of us. */
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 } else if (hip04_cpu_table[cluster][cpu] > 1) {
 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
 last_man = hip04_cluster_is_down(cluster);
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 /* Since it's Cortex A15, disable L2 prefetching. */
 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
 count = TIMEOUT_MSEC / POLL_MSEC;
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
 for (tries = 0; tries < count; tries++) {
 if (hip04_cpu_table[cluster][cpu])
@@ -211,10 +211,10 @@
 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
 if (data & CORE_WFI_STATUS(cpu))
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
 /* Wait for clean L2 when the whole cluster is down. */
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
@@ -231,10 +231,10 @@
 if (hip04_cluster_is_down(cluster))
 hip04_set_snoop_filter(cluster, 0);
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-omap2/omap-smp.c linux-4.9.6/arch/arm/mach-omap2/omap-smp.c
--- linux-4.9.6.orig/arch/arm/mach-omap2/omap-smp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-omap2/omap-smp.c 2017-01-28 13:59:09.895654805 +0100
 .startup_addr = omap5_secondary_startup,
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 void __iomem *omap4_get_scu_base(void)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * Update the AuxCoreBoot0 with boot state for secondary core.
 * Now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-prima2/platsmp.c linux-4.9.6/arch/arm/mach-prima2/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-prima2/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-prima2/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 static void __iomem *clk_base;
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void sirfsoc_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static const struct of_device_id clk_ids[] = {
 /* make sure write buffer is drained */
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mach-qcom/platsmp.c linux-4.9.6/arch/arm/mach-qcom/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-qcom/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-qcom/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 extern void secondary_startup_arm(void);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 #ifdef CONFIG_HOTPLUG_CPU
 static void qcom_cpu_die(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int scss_release_secondary(unsigned int cpu)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * Send the secondary CPU a soft interrupt, thereby causing
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-spear/platsmp.c linux-4.9.6/arch/arm/mach-spear/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-spear/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-spear/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mach-sti/platsmp.c linux-4.9.6/arch/arm/mach-sti/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-sti/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-sti/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void sti_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mm/fault.c linux-4.9.6/arch/arm/mm/fault.c
--- linux-4.9.6.orig/arch/arm/mm/fault.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mm/fault.c 2017-01-28 13:59:09.895654805 +0100
 if (addr < TASK_SIZE)
 return do_page_fault(addr, fsr, regs);
+ if (interrupts_enabled(regs))
+ local_irq_enable();
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ if (interrupts_enabled(regs))
+ local_irq_enable();
 do_bad_area(addr, fsr, regs);
diff -Nur linux-4.9.6.orig/arch/arm/mm/highmem.c linux-4.9.6/arch/arm/mm/highmem.c
--- linux-4.9.6.orig/arch/arm/mm/highmem.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mm/highmem.c 2017-01-28 13:59:09.895654805 +0100
+static unsigned int fixmap_idx(int type)
+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
 void *kmap(struct page *page)
 void *kmap_atomic(struct page *page)
+ pte_t pte = mk_pte(page, kmap_prot);
+ preempt_disable_nort();
 if (!PageHighMem(page))
 return page_address(page);
 type = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 * in place, so the contained TLB flush ensures the TLB is updated
 * with the new mapping.
- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = pte;
+ set_fixmap_pte(idx, pte);
 return (void *)vaddr;
@@ -106,44 +115,75 @@
 if (kvaddr >= (void *)FIXADDR_START) {
 type = kmap_atomic_idx();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = __pte(0);
 #ifdef CONFIG_DEBUG_HIGHMEM
 BUG_ON(vaddr != __fix_to_virt(idx));
- set_fixmap_pte(idx, __pte(0));
 (void) idx; /* to kill a warning */
+ set_fixmap_pte(idx, __pte(0));
 kmap_atomic_idx_pop();
 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
 /* this address was obtained through kmap_high_get() */
 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ preempt_enable_nort();
 EXPORT_SYMBOL(__kunmap_atomic);
 void *kmap_atomic_pfn(unsigned long pfn)
+ pte_t pte = pfn_pte(pfn, kmap_prot);
 struct page *page = pfn_to_page(pfn);
+ preempt_disable_nort();
 if (!PageHighMem(page))
 return page_address(page);
 type = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = pte;
+ set_fixmap_pte(idx, pte);
 return (void *)vaddr;
+#if defined CONFIG_PREEMPT_RT_FULL
+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
+ * Clear @prev's kmap_atomic mappings
+ for (i = 0; i < prev_p->kmap_idx; i++) {
+ int idx = fixmap_idx(i);
+ set_fixmap_pte(idx, __pte(0));
+ * Restore @next_p's kmap_atomic mappings
+ for (i = 0; i < next_p->kmap_idx; i++) {
+ int idx = fixmap_idx(i);
+ if (!pte_none(next_p->kmap_pte[i]))
+ set_fixmap_pte(idx, next_p->kmap_pte[i]);
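The highmem.c rework changes the contract of kmap_atomic() on RT: preempt_disable_nort() is a no-op there, so a task may now be preempted while holding an atomic kmap. To keep the mapping valid, the pte is cached per task (current->kmap_pte[], a field the wider RT patch adds to task_struct outside this excerpt), and the switch_kmaps() hook wired into switch_to() earlier tears down the previous task's fixmap slots and reinstalls the next task's. Callers are unchanged:

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_into_highpage(struct page *page, const void *buf, size_t len)
{
	void *vaddr = kmap_atomic(page);

	/* on RT this section may be preempted; the fixmap pte follows
	   the task via kmap_pte[]/switch_kmaps() */
	memcpy(vaddr, buf, len);
	kunmap_atomic(vaddr);
}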
diff -Nur linux-4.9.6.orig/arch/arm/plat-versatile/platsmp.c linux-4.9.6/arch/arm/plat-versatile/platsmp.c
--- linux-4.9.6.orig/arch/arm/plat-versatile/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/plat-versatile/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 void versatile_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * This is really belt and braces; we hold unintended secondary
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm64/include/asm/thread_info.h linux-4.9.6/arch/arm64/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/arm64/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/include/asm/thread_info.h 2017-01-28 13:59:09.895654805 +0100
 mm_segment_t addr_limit; /* address limit */
 struct task_struct *task; /* main task structure */
 int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
 #define TIF_NEED_RESCHED 1
 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
+#define TIF_NEED_RESCHED_LAZY 4
 #define TIF_SYSCALL_TRACE 8
 #define TIF_SYSCALL_AUDIT 9
 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_NOHZ (1 << TIF_NOHZ)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_32BIT (1 << TIF_32BIT)
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
+ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
+ _TIF_NEED_RESCHED_LAZY)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff -Nur linux-4.9.6.orig/arch/arm64/Kconfig linux-4.9.6/arch/arm64/Kconfig
--- linux-4.9.6.orig/arch/arm64/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/Kconfig 2017-01-28 13:59:09.895654805 +0100
 select HAVE_PERF_EVENTS
 select HAVE_PERF_REGS
 select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
 select HAVE_REGS_AND_STACK_ACCESS_API
 select HAVE_RCU_TABLE_FREE
 select HAVE_SYSCALL_TRACEPOINTS
 bool "Xen guest support on ARM64"
- depends on ARM64 && OF
+ depends on ARM64 && OF && !PREEMPT_RT_FULL
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/asm-offsets.c linux-4.9.6/arch/arm64/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/arm64/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/asm-offsets.c 2017-01-28 13:59:09.895654805 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/entry.S linux-4.9.6/arch/arm64/kernel/entry.S
--- linux-4.9.6.orig/arch/arm64/kernel/entry.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/entry.S 2017-01-28 13:59:09.895654805 +0100
@@ -428,11 +428,16 @@
 #ifdef CONFIG_PREEMPT
 ldr w24, [tsk, #TI_PREEMPT] // get preempt count
- cbnz w24, 1f // preempt count != 0
+ cbnz w24, 2f // preempt count != 0
 ldr x0, [tsk, #TI_FLAGS] // get flags
- tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
+ tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
+ ldr w24, [tsk, #TI_PREEMPT_LAZY] // get preempt lazy count
+ cbnz w24, 2f // preempt lazy count != 0
+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
 #ifdef CONFIG_TRACE_IRQFLAGS
 1: bl preempt_schedule_irq // irq en/disable is done inside
 ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
+ tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/signal.c linux-4.9.6/arch/arm64/kernel/signal.c
--- linux-4.9.6.orig/arch/arm64/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/signal.c 2017-01-28 13:59:09.895654805 +0100
 trace_hardirqs_off();
- if (thread_flags & _TIF_NEED_RESCHED) {
+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
diff -Nur linux-4.9.6.orig/arch/Kconfig linux-4.9.6/arch/Kconfig
--- linux-4.9.6.orig/arch/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/Kconfig 2017-01-28 13:59:09.887654498 +0100
 tristate "OProfile system profiling"
 depends on PROFILING
 depends on HAVE_OPROFILE
+ depends on !PREEMPT_RT_FULL
 select RING_BUFFER_ALLOW_SWAP
 bool "Optimize very unlikely/likely branches"
 depends on HAVE_ARCH_JUMP_LABEL
+ depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
 This option enables a transparent branch optimization that
 makes certain almost-always-true or almost-always-false branch
diff -Nur linux-4.9.6.orig/arch/mips/Kconfig linux-4.9.6/arch/mips/Kconfig
--- linux-4.9.6.orig/arch/mips/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/mips/Kconfig 2017-01-28 13:59:09.895654805 +0100
@@ -2514,7 +2514,7 @@
 bool "High Memory Support"
- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
 config CPU_SUPPORTS_HIGHMEM
diff -Nur linux-4.9.6.orig/arch/powerpc/include/asm/thread_info.h linux-4.9.6/arch/powerpc/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/powerpc/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/include/asm/thread_info.h 2017-01-28 13:59:09.895654805 +0100
 int cpu; /* cpu we're on */
 int preempt_count; /* 0 => preemptable,
+ int preempt_lazy_count; /* 0 => preemptable,
 unsigned long local_flags; /* private flags for thread */
 #ifdef CONFIG_LIVEPATCH
 unsigned long *livepatch_sp;
 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
 #define TIF_SIGPENDING 1 /* signal pending */
 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
-#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
- TIF_NEED_RESCHED */
+#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
 #define TIF_32BIT 4 /* 32 bit binary */
 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
 #if defined(CONFIG_PPC64)
 #define TIF_ELF2ABI 18 /* function descriptors must die! */
+#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
+ TIF_NEED_RESCHED */
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
@@ -125,14 +128,16 @@
 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
 #define _TIF_NOHZ (1<<TIF_NOHZ)
+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
 /* Bits in local_flags */
 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
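The bit shuffle has a likely reason beyond tidiness: the powerpc entry code tests these flags with andi., whose immediate is only 16 bits wide, so TIF_NEED_RESCHED_LAZY must live in bits 0-15. TIF_POLLING_NRFLAG is only touched from C, so it can be evicted to bit 19 to make room. A guard along these lines (illustrative, placed inside any compiled function) would document the constraint:

static void __init check_tif_layout(void)
{
	/* andi. takes a 16-bit immediate: the resched flags must stay low */
	BUILD_BUG_ON(_TIF_NEED_RESCHED_MASK & ~0xffffUL);
}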
diff -Nur linux-4.9.6.orig/arch/powerpc/Kconfig linux-4.9.6/arch/powerpc/Kconfig
--- linux-4.9.6.orig/arch/powerpc/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/Kconfig 2017-01-28 13:59:09.895654805 +0100
 config RWSEM_GENERIC_SPINLOCK
+ default y if PREEMPT_RT_FULL
 config RWSEM_XCHGADD_ALGORITHM
+ default y if !PREEMPT_RT_FULL
 config GENERIC_LOCKBREAK
 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 select GENERIC_STRNCPY_FROM_USER
 select GENERIC_STRNLEN_USER
+ select HAVE_PREEMPT_LAZY
 select HAVE_MOD_ARCH_SPECIFIC
 select MODULES_USE_ELF_RELA
 select CLONE_BACKWARDS
 bool "High memory support"
+ depends on PPC32 && !PREEMPT_RT_FULL
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/asm-offsets.c linux-4.9.6/arch/powerpc/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/powerpc/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/asm-offsets.c 2017-01-28 13:59:09.895654805 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/entry_32.S linux-4.9.6/arch/powerpc/kernel/entry_32.S
--- linux-4.9.6.orig/arch/powerpc/kernel/entry_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/entry_32.S 2017-01-28 13:59:09.895654805 +0100
@@ -835,7 +835,14 @@
 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
 andi. r8,r8,_TIF_NEED_RESCHED
+ lwz r0,TI_PREEMPT_LAZY(r9)
+ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
+ lwz r0,TI_FLAGS(r9)
+ andi. r0,r0,_TIF_NEED_RESCHED_LAZY
 andi. r0,r3,MSR_EE /* interrupts off? */
 beq restore /* don't schedule if so */
@@ -846,11 +853,11 @@
 bl trace_hardirqs_off
-1: bl preempt_schedule_irq
+2: bl preempt_schedule_irq
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r3,_TIF_NEED_RESCHED
+ andi. r0,r3,_TIF_NEED_RESCHED_MASK
 #ifdef CONFIG_TRACE_IRQFLAGS
 /* And now, to properly rebalance the above, we tell lockdep they
 * are being turned back on, which will happen when we return
@@ -1171,7 +1178,7 @@
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
 do_work: /* r10 contains MSR_KERNEL here */
- andi. r0,r9,_TIF_NEED_RESCHED
+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
 do_resched: /* r10 contains MSR_KERNEL here */
@@ -1192,7 +1199,7 @@
 MTMSRD(r10) /* disable interrupts */
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r9,_TIF_NEED_RESCHED
+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
 andi. r0,r9,_TIF_USER_WORK_MASK
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/entry_64.S linux-4.9.6/arch/powerpc/kernel/entry_64.S
--- linux-4.9.6.orig/arch/powerpc/kernel/entry_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/entry_64.S 2017-01-28 13:59:09.895654805 +0100
-1: andi. r0,r4,_TIF_NEED_RESCHED
+1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
 bl restore_interrupts
@@ -718,10 +718,18 @@
 #ifdef CONFIG_PREEMPT
 /* Check if we need to preempt */
+ lwz r8,TI_PREEMPT(r9)
+ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
 andi. r0,r4,_TIF_NEED_RESCHED
+ andi. r0,r4,_TIF_NEED_RESCHED_LAZY
+ lwz r8,TI_PREEMPT_LAZY(r9)
 /* Check that preempt_count() == 0 and interrupts are enabled */
- lwz r8,TI_PREEMPT(r9)
 /* Re-test flags and eventually loop */
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r4,_TIF_NEED_RESCHED
+ andi. r0,r4,_TIF_NEED_RESCHED_MASK
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/irq.c linux-4.9.6/arch/powerpc/kernel/irq.c
--- linux-4.9.6.orig/arch/powerpc/kernel/irq.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/irq.c 2017-01-28 13:59:09.895654805 +0100
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 struct thread_info *curtp, *irqtp;
 set_bits(irqtp->flags, &curtp->flags);
 irq_hw_number_t virq_to_hw(unsigned int virq)
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/misc_32.S linux-4.9.6/arch/powerpc/kernel/misc_32.S
--- linux-4.9.6.orig/arch/powerpc/kernel/misc_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/misc_32.S 2017-01-28 13:59:09.895654805 +0100
 * We store the saved ksp_limit in the unused part
 * of the STACK_FRAME_OVERHEAD
+#ifndef CONFIG_PREEMPT_RT_FULL
 _GLOBAL(call_do_softirq)
 stw r10,THREAD+KSP_LIMIT(r2)
 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/misc_64.S linux-4.9.6/arch/powerpc/kernel/misc_64.S
--- linux-4.9.6.orig/arch/powerpc/kernel/misc_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/misc_64.S 2017-01-28 13:59:09.895654805 +0100
+#ifndef CONFIG_PREEMPT_RT_FULL
 _GLOBAL(call_do_softirq)
 _GLOBAL(call_do_irq)
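powerpc, and below it sh, sparc64 and x86_64, all wrap their do_softirq_own_stack()/call_do_softirq implementations in #ifndef CONFIG_PREEMPT_RT_FULL: on RT, softirqs run in thread context (ksoftirqd or the raising task), so the switch-to-IRQ-stack entry points are dead code there. Condensed from the powerpc version, whose surrounding lines are elided above:

#ifndef CONFIG_PREEMPT_RT_FULL
void do_softirq_own_stack(void)
{
	struct thread_info *curtp = current_thread_info();
	struct thread_info *irqtp = softirq_ctx[smp_processor_id()];

	irqtp->task = curtp->task;
	irqtp->flags = 0;
	call_do_softirq(irqtp);	/* run __do_softirq() on the softirq stack */
	irqtp->task = NULL;
	if (irqtp->flags)	/* propagate flags set while borrowed */
		set_bits(irqtp->flags, &curtp->flags);
}
#endif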
diff -Nur linux-4.9.6.orig/arch/powerpc/kvm/Kconfig linux-4.9.6/arch/powerpc/kvm/Kconfig
--- linux-4.9.6.orig/arch/powerpc/kvm/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kvm/Kconfig 2017-01-28 13:59:09.895654805 +0100
 bool "KVM in-kernel MPIC emulation"
 depends on KVM && E500
+ depends on !PREEMPT_RT_FULL
 select HAVE_KVM_IRQCHIP
 select HAVE_KVM_IRQFD
 select HAVE_KVM_IRQ_ROUTING
diff -Nur linux-4.9.6.orig/arch/powerpc/platforms/ps3/device-init.c linux-4.9.6/arch/powerpc/platforms/ps3/device-init.c
--- linux-4.9.6.orig/arch/powerpc/platforms/ps3/device-init.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/platforms/ps3/device-init.c 2017-01-28 13:59:09.895654805 +0100
 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
- res = wait_event_interruptible(dev->done.wait,
+ res = swait_event_interruptible(dev->done.wait,
 dev->done.done || kthread_should_stop());
 if (kthread_should_stop())
diff -Nur linux-4.9.6.orig/arch/sh/kernel/irq.c linux-4.9.6/arch/sh/kernel/irq.c
--- linux-4.9.6.orig/arch/sh/kernel/irq.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sh/kernel/irq.c 2017-01-28 13:59:09.895654805 +0100
 hardirq_ctx[cpu] = NULL;
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 struct thread_info *curctx;
 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
 static inline void handle_one_irq(unsigned int irq)
diff -Nur linux-4.9.6.orig/arch/sparc/Kconfig linux-4.9.6/arch/sparc/Kconfig
--- linux-4.9.6.orig/arch/sparc/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sparc/Kconfig 2017-01-28 13:59:09.895654805 +0100
@@ -194,12 +194,10 @@
 source kernel/Kconfig.hz
 config RWSEM_GENERIC_SPINLOCK
- default y if SPARC32
+ def_bool PREEMPT_RT_FULL
 config RWSEM_XCHGADD_ALGORITHM
- default y if SPARC64
+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
 config GENERIC_HWEIGHT
diff -Nur linux-4.9.6.orig/arch/sparc/kernel/irq_64.c linux-4.9.6/arch/sparc/kernel/irq_64.c
--- linux-4.9.6.orig/arch/sparc/kernel/irq_64.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sparc/kernel/irq_64.c 2017-01-28 13:59:09.895654805 +0100
 set_irq_regs(old_regs);
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 __asm__ __volatile__("mov %0, %%sp"
 #ifdef CONFIG_HOTPLUG_CPU
 void fixup_irqs(void)
diff -Nur linux-4.9.6.orig/arch/x86/crypto/aesni-intel_glue.c linux-4.9.6/arch/x86/crypto/aesni-intel_glue.c
--- linux-4.9.6.orig/arch/x86/crypto/aesni-intel_glue.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/aesni-intel_glue.c 2017-01-28 13:59:09.895654805 +0100
@@ -372,14 +372,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+ nbytes & AES_BLOCK_MASK);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -396,14 +396,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -420,14 +420,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -444,14 +444,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -503,18 +503,20 @@
 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+ kernel_fpu_begin();
 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
+ kernel_fpu_begin();
 ctr_crypt_final(ctx, &walk);
 err = blkcipher_walk_done(desc, &walk, 0);
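All five aesni hunks make the same move: kernel_fpu_begin() disables preemption, so holding the FPU across an entire blkcipher walk creates a non-preemptible region proportional to the request size. Pulling begin/end into the loop bounds the preempt-off time to one walk step (the matching kernel_fpu_end() moves are elided by this truncated listing). The resulting loop shape:

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();		/* preemption off ...         */
		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK);
		kernel_fpu_end();		/* ... and on again, per step */
		nbytes &= AES_BLOCK_SIZE - 1;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}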
diff -Nur linux-4.9.6.orig/arch/x86/crypto/cast5_avx_glue.c linux-4.9.6/arch/x86/crypto/cast5_avx_glue.c
--- linux-4.9.6.orig/arch/x86/crypto/cast5_avx_glue.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/cast5_avx_glue.c 2017-01-28 13:59:09.895654805 +0100
 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
- bool fpu_enabled = false;
 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 const unsigned int bsize = CAST5_BLOCK_SIZE;
 unsigned int nbytes;
 u8 *wsrc = walk->src.virt.addr;
 u8 *wdst = walk->dst.virt.addr;
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 /* Process multi-block batch */
 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
@@ -103,10 +103,9 @@
 } while (nbytes >= bsize);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 struct scatterlist *src, unsigned int nbytes)
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -236,12 +235,11 @@
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 nbytes = __cbc_decrypt(desc, &walk);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 struct scatterlist *src, unsigned int nbytes)
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -320,13 +318,12 @@
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 nbytes = __ctr_crypt(desc, &walk);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 ctr_crypt_final(desc, &walk);
 err = blkcipher_walk_done(desc, &walk, 0);
diff -Nur linux-4.9.6.orig/arch/x86/crypto/glue_helper.c linux-4.9.6/arch/x86/crypto/glue_helper.c
--- linux-4.9.6.orig/arch/x86/crypto/glue_helper.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/glue_helper.c 2017-01-28 13:59:09.895654805 +0100
 void *ctx = crypto_blkcipher_ctx(desc->tfm);
 const unsigned int bsize = 128 / 8;
 unsigned int nbytes, i, func_bytes;
- bool fpu_enabled = false;
 err = blkcipher_walk_virt(desc, walk);
 u8 *wdst = walk->dst.virt.addr;
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 for (i = 0; i < gctx->num_funcs; i++) {
 func_bytes = bsize * gctx->funcs[i].num_blocks;
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, walk, nbytes);
- glue_fpu_end(fpu_enabled);
 struct scatterlist *src, unsigned int nbytes)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -203,12 +203,12 @@
 while ((nbytes = walk.nbytes)) {
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- glue_fpu_end(fpu_enabled);
 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
 struct scatterlist *src, unsigned int nbytes)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -286,13 +286,12 @@
 while ((nbytes = walk.nbytes) >= bsize) {
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- glue_fpu_end(fpu_enabled);
 glue_ctr_crypt_final_128bit(
 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
 void *tweak_ctx, void *crypt_ctx)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -360,21 +359,21 @@
 /* set minimum length to bsize, for tweak_fn */
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled,
 nbytes < bsize ? bsize : nbytes);
 /* calculate first value of T */
 tweak_fn(tweak_ctx, walk.iv, walk.iv);
+ glue_fpu_end(fpu_enabled);
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, false, nbytes);
 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
 nbytes = walk.nbytes;
- glue_fpu_end(fpu_enabled);
 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
diff -Nur linux-4.9.6.orig/arch/x86/entry/common.c linux-4.9.6/arch/x86/entry/common.c
--- linux-4.9.6.orig/arch/x86/entry/common.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/common.c 2017-01-28 13:59:09.899654958 +0100
 #define EXIT_TO_USERMODE_LOOP_FLAGS \
 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
+ _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
@@ -145,9 +145,16 @@
 /* We have work to do. */
- if (cached_flags & _TIF_NEED_RESCHED)
+ if (cached_flags & _TIF_NEED_RESCHED_MASK)
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (unlikely(current->forced_info.si_signo)) {
+ struct task_struct *t = current;
+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
+ t->forced_info.si_signo = 0;
 if (cached_flags & _TIF_UPROBE)
 uprobe_notify_resume(regs);
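The forced_info block above is the delivery half of ARCH_RT_DELAYS_SIGNAL_SEND (defined for x86 in the signal.h hunk further down); task_struct::forced_info itself is added elsewhere in the RT patch, outside this excerpt. The sending half, sketched: when force_sig_info() runs in atomic context on RT it cannot take the sighand lock (a sleeping lock there), so it parks the siginfo and lets the exit path above deliver it:

	/* sketch of the sending side, not the patch's exact code */
	if (in_atomic()) {
		current->forced_info = *info;	/* picked up later in
						   exit_to_usermode_loop() */
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return 0;
	}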
diff -Nur linux-4.9.6.orig/arch/x86/entry/entry_32.S linux-4.9.6/arch/x86/entry/entry_32.S
--- linux-4.9.6.orig/arch/x86/entry/entry_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/entry_32.S 2017-01-28 13:59:09.899654958 +0100
@@ -308,8 +308,25 @@
 ENTRY(resume_kernel)
 DISABLE_INTERRUPTS(CLBR_ANY)
+ # preempt count == 0 + NEED_RS set?
 cmpl $0, PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+ # at least preempt count == 0 ?
+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+ movl PER_CPU_VAR(current_task), %ebp
+ cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
+ testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
 call preempt_schedule_irq
diff -Nur linux-4.9.6.orig/arch/x86/entry/entry_64.S linux-4.9.6/arch/x86/entry/entry_64.S
--- linux-4.9.6.orig/arch/x86/entry/entry_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/entry_64.S 2017-01-28 13:59:09.899654958 +0100
@@ -546,7 +546,23 @@
 bt $9, EFLAGS(%rsp) /* were interrupts off? */
 0: cmpl $0, PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+ jz do_preempt_schedule_irq
+ # at least preempt count == 0 ?
+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+ movq PER_CPU_VAR(current_task), %rcx
+ cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
+ bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
+do_preempt_schedule_irq:
 call preempt_schedule_irq
+#ifndef CONFIG_PREEMPT_RT_FULL
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
 decl PER_CPU_VAR(irq_count)
 END(do_softirq_own_stack)
 idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
diff -Nur linux-4.9.6.orig/arch/x86/include/asm/preempt.h linux-4.9.6/arch/x86/include/asm/preempt.h
--- linux-4.9.6.orig/arch/x86/include/asm/preempt.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/include/asm/preempt.h 2017-01-28 13:59:09.899654958 +0100
 * a decrement which hits zero means we have no preempt_count and should
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+static __always_inline bool __preempt_count_dec_and_test(void)
+ if (____preempt_count_dec_and_test())
+#ifdef CONFIG_PREEMPT_LAZY
+ if (current_thread_info()->preempt_lazy_count)
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
 * Returns true when we need to resched and can (barring IRQ state).
 static __always_inline bool should_resched(int preempt_offset)
+#ifdef CONFIG_PREEMPT_LAZY
+ tmp = raw_cpu_read_4(__preempt_count);
+ if (tmp == preempt_offset)
+ /* preempt count == 0 ? */
+ tmp &= ~PREEMPT_NEED_RESCHED;
+ if (current_thread_info()->preempt_lazy_count)
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
 #ifdef CONFIG_PREEMPT
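The two preempt.h hunks above are truncated (returns and #else arms elided). Reassembled, they presumably read as below: a decrement that hits zero still reschedules immediately, and otherwise the lazy flag only counts when no lazy sections are open. A reconstruction, not a quote:

static __always_inline bool __preempt_count_dec_and_test(void)
{
	if (____preempt_count_dec_and_test())
		return true;
#ifdef CONFIG_PREEMPT_LAZY
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return false;
#endif
}

static __always_inline bool should_resched(int preempt_offset)
{
#ifdef CONFIG_PREEMPT_LAZY
	u32 tmp = raw_cpu_read_4(__preempt_count);

	if (tmp == preempt_offset)
		return true;

	/* preempt count == 0 ? */
	tmp &= ~PREEMPT_NEED_RESCHED;
	if (tmp)
		return false;
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
#endif
}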
1810 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/signal.h linux-4.9.6/arch/x86/include/asm/signal.h
1811 --- linux-4.9.6.orig/arch/x86/include/asm/signal.h 2017-01-26 08:25:24.000000000 +0100
1812 +++ linux-4.9.6/arch/x86/include/asm/signal.h 2017-01-28 13:59:09.899654958 +0100
1814 #define SA_IA32_ABI 0x02000000u
1815 #define SA_X32_ABI 0x01000000u
1818 + * Because some traps use the IST stack, we must keep preemption
1819 + * disabled while calling do_trap(), but do_trap() may call
1820 + * force_sig_info() which will grab the signal spin_locks for the
1821 + * task, which in PREEMPT_RT_FULL are mutexes. By defining
1822 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
1823 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
1826 +#if defined(CONFIG_PREEMPT_RT_FULL)
1827 +#define ARCH_RT_DELAYS_SIGNAL_SEND
1830 #ifndef CONFIG_COMPAT
1831 typedef sigset_t compat_sigset_t;
1833 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/stackprotector.h linux-4.9.6/arch/x86/include/asm/stackprotector.h
1834 --- linux-4.9.6.orig/arch/x86/include/asm/stackprotector.h 2017-01-26 08:25:24.000000000 +0100
1835 +++ linux-4.9.6/arch/x86/include/asm/stackprotector.h 2017-01-28 13:59:09.899654958 +0100
1838 static __always_inline void boot_init_stack_canary(void)
1841 + u64 uninitialized_var(canary);
1844 #ifdef CONFIG_X86_64
1846 * of randomness. The TSC only matters for very early init,
1847 * there it already has some randomness on most systems. Later
1848 * on during the bootup the random pool has true entropy too.
1850 + * For preempt-rt we need to weaken the randomness a bit, as
1851 + * we can't call into the random generator from atomic context
1852 + * due to locking constraints. We just leave canary
1853 + * uninitialized and use the TSC based randomness on top of it.
1855 +#ifndef CONFIG_PREEMPT_RT_FULL
1856 get_random_bytes(&canary, sizeof(canary));
1859 canary += tsc + (tsc << 32UL);
1861 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/thread_info.h linux-4.9.6/arch/x86/include/asm/thread_info.h
1862 --- linux-4.9.6.orig/arch/x86/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
1863 +++ linux-4.9.6/arch/x86/include/asm/thread_info.h 2017-01-28 13:59:09.899654958 +0100
1866 struct thread_info {
1867 unsigned long flags; /* low level flags */
1868 + int preempt_lazy_count; /* 0 => lazy preemptable
1872 #define INIT_THREAD_INFO(tsk) \
1875 + .preempt_lazy_count = 0, \
1878 #define init_stack (init_thread_union.stack)
1881 #include <asm/asm-offsets.h>
1883 +#define GET_THREAD_INFO(reg) \
1884 + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
1885 + _ASM_SUB $(THREAD_SIZE),reg ;
1891 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
1892 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
1893 #define TIF_SECCOMP 8 /* secure computing */
1894 +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
1895 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
1896 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
1897 #define TIF_NOTSC 16 /* TSC is not accessible in userland */
1899 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
1900 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1901 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
1902 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1903 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
1904 #define _TIF_UPROBE (1 << TIF_UPROBE)
1905 #define _TIF_NOTSC (1 << TIF_NOTSC)
1907 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
1908 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
1910 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1912 #define STACK_WARN (THREAD_SIZE/8)
1915 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/uv/uv_bau.h linux-4.9.6/arch/x86/include/asm/uv/uv_bau.h
1916 --- linux-4.9.6.orig/arch/x86/include/asm/uv/uv_bau.h 2017-01-26 08:25:24.000000000 +0100
1917 +++ linux-4.9.6/arch/x86/include/asm/uv/uv_bau.h 2017-01-28 13:59:09.899654958 +0100
1919 cycles_t send_message;
1920 cycles_t period_end;
1921 cycles_t period_time;
1922 - spinlock_t uvhub_lock;
1923 - spinlock_t queue_lock;
1924 - spinlock_t disable_lock;
1925 + raw_spinlock_t uvhub_lock;
1926 + raw_spinlock_t queue_lock;
1927 + raw_spinlock_t disable_lock;
1930 int max_concurr_const;
1931 @@ -815,15 +815,15 @@
1932 * to be lowered below the current 'v'. atomic_add_unless can only stop
1935 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
1936 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
1939 + raw_spin_lock(lock);
1940 if (atomic_read(v) >= u) {
1941 - spin_unlock(lock);
1942 + raw_spin_unlock(lock);
1946 - spin_unlock(lock);
1947 + raw_spin_unlock(lock);
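
Restoring the lines the hunk elides, the converted helper reads in full (reconstructed from the mainline 4.9 helper plus the raw_ conversion above):

	static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
	{
		raw_spin_lock(lock);
		if (atomic_read(v) >= u) {
			raw_spin_unlock(lock);
			return 0;
		}
		atomic_inc(v);
		raw_spin_unlock(lock);
		return 1;
	}
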
1951 diff -Nur linux-4.9.6.orig/arch/x86/Kconfig linux-4.9.6/arch/x86/Kconfig
1952 --- linux-4.9.6.orig/arch/x86/Kconfig 2017-01-26 08:25:24.000000000 +0100
1953 +++ linux-4.9.6/arch/x86/Kconfig 2017-01-28 13:59:09.895654805 +0100
1958 + select HAVE_PREEMPT_LAZY
1959 select ACPI_LEGACY_TABLES_LOOKUP if ACPI
1960 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1962 @@ -232,8 +233,11 @@
1964 depends on ISA_DMA_API
1966 +config RWSEM_GENERIC_SPINLOCK
1967 + def_bool PREEMPT_RT_FULL
1969 config RWSEM_XCHGADD_ALGORITHM
1971 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1973 config GENERIC_CALIBRATE_DELAY
1977 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1978 depends on X86_64 && SMP && DEBUG_KERNEL
1979 - select CPUMASK_OFFSTACK
1980 + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1982 Enable maximum number of CPUS and NUMA Nodes for this architecture.
1984 diff -Nur linux-4.9.6.orig/arch/x86/kernel/acpi/boot.c linux-4.9.6/arch/x86/kernel/acpi/boot.c
1985 --- linux-4.9.6.orig/arch/x86/kernel/acpi/boot.c 2017-01-26 08:25:24.000000000 +0100
1986 +++ linux-4.9.6/arch/x86/kernel/acpi/boot.c 2017-01-28 13:59:09.899654958 +0100
1991 +#ifdef CONFIG_X86_IO_APIC
1992 static DEFINE_MUTEX(acpi_ioapic_lock);
1995 /* --------------------------------------------------------------------------
1996 Boot-time Configuration
1997 diff -Nur linux-4.9.6.orig/arch/x86/kernel/apic/io_apic.c linux-4.9.6/arch/x86/kernel/apic/io_apic.c
1998 --- linux-4.9.6.orig/arch/x86/kernel/apic/io_apic.c 2017-01-26 08:25:24.000000000 +0100
1999 +++ linux-4.9.6/arch/x86/kernel/apic/io_apic.c 2017-01-28 13:59:09.899654958 +0100
2000 @@ -1712,7 +1712,8 @@
2001 static inline bool ioapic_irqd_mask(struct irq_data *data)
2003 /* If we are moving the irq we need to mask it */
2004 - if (unlikely(irqd_is_setaffinity_pending(data))) {
2005 + if (unlikely(irqd_is_setaffinity_pending(data) &&
2006 + !irqd_irq_inprogress(data))) {
2007 mask_ioapic_irq(data);
2010 diff -Nur linux-4.9.6.orig/arch/x86/kernel/asm-offsets.c linux-4.9.6/arch/x86/kernel/asm-offsets.c
2011 --- linux-4.9.6.orig/arch/x86/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
2012 +++ linux-4.9.6/arch/x86/kernel/asm-offsets.c 2017-01-28 13:59:09.899654958 +0100
2016 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2017 + OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2018 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2024 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2025 + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2027 diff -Nur linux-4.9.6.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-4.9.6/arch/x86/kernel/cpu/mcheck/mce.c
2028 --- linux-4.9.6.orig/arch/x86/kernel/cpu/mcheck/mce.c 2017-01-26 08:25:24.000000000 +0100
2029 +++ linux-4.9.6/arch/x86/kernel/cpu/mcheck/mce.c 2017-01-28 13:59:09.899654958 +0100
2031 #include <linux/debugfs.h>
2032 #include <linux/irq_work.h>
2033 #include <linux/export.h>
2034 +#include <linux/jiffies.h>
2035 +#include <linux/swork.h>
2036 #include <linux/jump_label.h>
2038 #include <asm/processor.h>
2039 @@ -1317,7 +1319,7 @@
2040 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2042 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2043 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2044 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2046 static unsigned long mce_adjust_timer_default(unsigned long interval)
2048 @@ -1326,32 +1328,18 @@
2050 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2052 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2053 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2055 - unsigned long when = jiffies + interval;
2056 - unsigned long flags;
2058 - local_irq_save(flags);
2060 - if (timer_pending(t)) {
2061 - if (time_before(when, t->expires))
2062 - mod_timer(t, when);
2064 - t->expires = round_jiffies(when);
2065 - add_timer_on(t, smp_processor_id());
2068 - local_irq_restore(flags);
2070 + return HRTIMER_NORESTART;
2071 + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2072 + return HRTIMER_RESTART;
2075 -static void mce_timer_fn(unsigned long data)
2076 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2078 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2079 - int cpu = smp_processor_id();
2082 - WARN_ON(cpu != data);
2084 iv = __this_cpu_read(mce_next_interval);
2086 if (mce_available(this_cpu_ptr(&cpu_info))) {
2087 @@ -1374,7 +1362,7 @@
2090 __this_cpu_write(mce_next_interval, iv);
2091 - __restart_timer(t, iv);
2092 + return __restart_timer(timer, iv);
2096 @@ -1382,7 +1370,7 @@
2098 void mce_timer_kick(unsigned long interval)
2100 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2101 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2102 unsigned long iv = __this_cpu_read(mce_next_interval);
2104 __restart_timer(t, interval);
2105 @@ -1397,7 +1385,7 @@
2108 for_each_online_cpu(cpu)
2109 - del_timer_sync(&per_cpu(mce_timer, cpu));
2110 + hrtimer_cancel(&per_cpu(mce_timer, cpu));
2113 static void mce_do_trigger(struct work_struct *work)
2114 @@ -1407,6 +1395,56 @@
2116 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2118 +static void __mce_notify_work(struct swork_event *event)
2120 + /* Not more than two messages every minute */
2121 + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2123 + /* wake processes polling /dev/mcelog */
2124 + wake_up_interruptible(&mce_chrdev_wait);
2127 + * There is no risk of missing notifications because
2128 + * work_pending is always cleared before the function is
2131 + if (mce_helper[0] && !work_pending(&mce_trigger_work))
2132 + schedule_work(&mce_trigger_work);
2134 + if (__ratelimit(&ratelimit))
2135 + pr_info(HW_ERR "Machine check events logged\n");
2138 +#ifdef CONFIG_PREEMPT_RT_FULL
2139 +static bool notify_work_ready __read_mostly;
2140 +static struct swork_event notify_work;
2142 +static int mce_notify_work_init(void)
2146 + err = swork_get();
2150 + INIT_SWORK(&notify_work, __mce_notify_work);
2151 + notify_work_ready = true;
2155 +static void mce_notify_work(void)
2157 + if (notify_work_ready)
2158 + swork_queue(&notify_work);
2161 +static void mce_notify_work(void)
2163 + __mce_notify_work(NULL);
2165 +static inline int mce_notify_work_init(void) { return 0; }
2169 * Notify the user(s) about new machine check events.
2170 * Can be called from interrupt context, but not from machine check/NMI
2171 @@ -1414,19 +1452,8 @@
2173 int mce_notify_irq(void)
2175 - /* Not more than two messages every minute */
2176 - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2178 if (test_and_clear_bit(0, &mce_need_notify)) {
2179 - /* wake processes polling /dev/mcelog */
2180 - wake_up_interruptible(&mce_chrdev_wait);
2182 - if (mce_helper[0])
2183 - schedule_work(&mce_trigger_work);
2185 - if (__ratelimit(&ratelimit))
2186 - pr_info(HW_ERR "Machine check events logged\n");
2188 + mce_notify_work();
2192 @@ -1732,7 +1759,7 @@
2196 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2197 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2199 unsigned long iv = check_interval * HZ;
2201 @@ -1741,16 +1768,17 @@
2203 per_cpu(mce_next_interval, cpu) = iv;
2205 - t->expires = round_jiffies(jiffies + iv);
2206 - add_timer_on(t, cpu);
2207 + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2208 + 0, HRTIMER_MODE_REL_PINNED);
2211 static void __mcheck_cpu_init_timer(void)
2213 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2214 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2215 unsigned int cpu = smp_processor_id();
2217 - setup_pinned_timer(t, mce_timer_fn, cpu);
2218 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2219 + t->function = mce_timer_fn;
2220 mce_start_timer(cpu, t);
2223 @@ -2475,6 +2503,8 @@
2224 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2227 + hrtimer_cancel(this_cpu_ptr(&mce_timer));
2229 if (!(action & CPU_TASKS_FROZEN))
2232 @@ -2497,6 +2527,7 @@
2234 wrmsrl(msr_ops.ctl(i), b->ctl);
2236 + __mcheck_cpu_init_timer();
2239 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2240 @@ -2504,7 +2535,6 @@
2241 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2243 unsigned int cpu = (unsigned long)hcpu;
2244 - struct timer_list *t = &per_cpu(mce_timer, cpu);
2246 switch (action & ~CPU_TASKS_FROZEN) {
2248 @@ -2524,11 +2554,9 @@
2250 case CPU_DOWN_PREPARE:
2251 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2252 - del_timer_sync(t);
2254 case CPU_DOWN_FAILED:
2255 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2256 - mce_start_timer(cpu, t);
2260 @@ -2567,6 +2595,10 @@
2264 + err = mce_notify_work_init();
2268 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
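
Two conversions happen in this file: the per-CPU polling timer becomes an hrtimer (whose callback the RT kernel can keep deterministic, instead of relying on the now-threaded timer softirq), and user notification goes through the swork layer so it never runs in a context where sleeping locks are forbidden. With the elided guard restored, __restart_timer() plausibly reads (a reconstruction, assuming the usual zero-interval check):

	static enum hrtimer_restart __restart_timer(struct hrtimer *timer,
						    unsigned long interval)
	{
		if (!interval)
			return HRTIMER_NORESTART;
		hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
		return HRTIMER_RESTART;
	}
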
2271 diff -Nur linux-4.9.6.orig/arch/x86/kernel/irq_32.c linux-4.9.6/arch/x86/kernel/irq_32.c
2272 --- linux-4.9.6.orig/arch/x86/kernel/irq_32.c 2017-01-26 08:25:24.000000000 +0100
2273 +++ linux-4.9.6/arch/x86/kernel/irq_32.c 2017-01-28 13:59:09.899654958 +0100
2275 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
2278 +#ifndef CONFIG_PREEMPT_RT_FULL
2279 void do_softirq_own_stack(void)
2281 struct irq_stack *irqstk;
2284 call_on_stack(__do_softirq, isp);
2288 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2290 diff -Nur linux-4.9.6.orig/arch/x86/kernel/process_32.c linux-4.9.6/arch/x86/kernel/process_32.c
2291 --- linux-4.9.6.orig/arch/x86/kernel/process_32.c 2017-01-26 08:25:24.000000000 +0100
2292 +++ linux-4.9.6/arch/x86/kernel/process_32.c 2017-01-28 13:59:09.899654958 +0100
2294 #include <linux/uaccess.h>
2295 #include <linux/io.h>
2296 #include <linux/kdebug.h>
2297 +#include <linux/highmem.h>
2299 #include <asm/pgtable.h>
2300 #include <asm/ldt.h>
2301 @@ -195,6 +196,35 @@
2303 EXPORT_SYMBOL_GPL(start_thread);
2305 +#ifdef CONFIG_PREEMPT_RT_FULL
2306 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2311 + * Clear @prev's kmap_atomic mappings
2313 + for (i = 0; i < prev_p->kmap_idx; i++) {
2314 + int idx = i + KM_TYPE_NR * smp_processor_id();
2315 + pte_t *ptep = kmap_pte - idx;
2317 + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2320 + * Restore @next_p's kmap_atomic mappings
2322 + for (i = 0; i < next_p->kmap_idx; i++) {
2323 + int idx = i + KM_TYPE_NR * smp_processor_id();
2325 + if (!pte_none(next_p->kmap_pte[i]))
2326 + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2331 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2336 * switch_to(x,y) should switch tasks from x to y.
2338 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2339 __switch_to_xtra(prev_p, next_p, tss);
2341 + switch_kmaps(prev_p, next_p);
2344 * Leave lazy mode, flushing any hypercalls made here.
2345 * This must be done before restoring TLS segments so
2346 diff -Nur linux-4.9.6.orig/arch/x86/kvm/lapic.c linux-4.9.6/arch/x86/kvm/lapic.c
2347 --- linux-4.9.6.orig/arch/x86/kvm/lapic.c 2017-01-26 08:25:24.000000000 +0100
2348 +++ linux-4.9.6/arch/x86/kvm/lapic.c 2017-01-28 13:59:09.899654958 +0100
2349 @@ -1939,6 +1939,7 @@
2350 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2351 HRTIMER_MODE_ABS_PINNED);
2352 apic->lapic_timer.timer.function = apic_timer_fn;
2353 + apic->lapic_timer.timer.irqsafe = 1;
2356 * APIC is created enabled. This will prevent kvm_lapic_set_base from
2357 diff -Nur linux-4.9.6.orig/arch/x86/kvm/x86.c linux-4.9.6/arch/x86/kvm/x86.c
2358 --- linux-4.9.6.orig/arch/x86/kvm/x86.c 2017-01-26 08:25:24.000000000 +0100
2359 +++ linux-4.9.6/arch/x86/kvm/x86.c 2017-01-28 13:59:09.903655111 +0100
2360 @@ -5932,6 +5932,13 @@
2364 +#ifdef CONFIG_PREEMPT_RT_FULL
2365 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2366 + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2367 + return -EOPNOTSUPP;
2371 r = kvm_mmu_module_init();
2373 goto out_free_percpu;
2374 diff -Nur linux-4.9.6.orig/arch/x86/mm/highmem_32.c linux-4.9.6/arch/x86/mm/highmem_32.c
2375 --- linux-4.9.6.orig/arch/x86/mm/highmem_32.c 2017-01-26 08:25:24.000000000 +0100
2376 +++ linux-4.9.6/arch/x86/mm/highmem_32.c 2017-01-28 13:59:09.903655111 +0100
2379 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2381 + pte_t pte = mk_pte(page, prot);
2382 unsigned long vaddr;
2385 - preempt_disable();
2386 + preempt_disable_nort();
2387 pagefault_disable();
2389 if (!PageHighMem(page))
2391 idx = type + KM_TYPE_NR*smp_processor_id();
2392 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2393 BUG_ON(!pte_none(*(kmap_pte-idx)));
2394 - set_pte(kmap_pte-idx, mk_pte(page, prot));
2395 +#ifdef CONFIG_PREEMPT_RT_FULL
2396 + current->kmap_pte[type] = pte;
2398 + set_pte(kmap_pte-idx, pte);
2399 arch_flush_lazy_mmu_mode();
2401 return (void *)vaddr;
2403 * is a bad idea also, in case the page changes cacheability
2404 * attributes or becomes a protected page in a hypervisor.
2406 +#ifdef CONFIG_PREEMPT_RT_FULL
2407 + current->kmap_pte[type] = __pte(0);
2409 kpte_clear_flush(kmap_pte-idx, vaddr);
2410 kmap_atomic_idx_pop();
2411 arch_flush_lazy_mmu_mode();
2417 + preempt_enable_nort();
2419 EXPORT_SYMBOL(__kunmap_atomic);
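
Taken together with the switch_kmaps() hunk in process_32.c above, the effect is that an atomic kmap section is no longer an implicit preempt-off section on RT: every active mapping's pte is remembered in current->kmap_pte[], and switch_kmaps() replays it into the incoming CPU's fixmap slots at context switch. From the caller's point of view (a sketch; page, buf and len are placeholders):

	void *vaddr = kmap_atomic(page); /* RT: pte also saved in current->kmap_pte[] */
	memcpy(vaddr, buf, len);         /* task may be preempted here on RT */
	kunmap_atomic(vaddr);            /* mapping was replayed by switch_kmaps() */
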
2421 diff -Nur linux-4.9.6.orig/arch/x86/mm/iomap_32.c linux-4.9.6/arch/x86/mm/iomap_32.c
2422 --- linux-4.9.6.orig/arch/x86/mm/iomap_32.c 2017-01-26 08:25:24.000000000 +0100
2423 +++ linux-4.9.6/arch/x86/mm/iomap_32.c 2017-01-28 13:59:09.903655111 +0100
2426 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2428 + pte_t pte = pfn_pte(pfn, prot);
2429 unsigned long vaddr;
2433 type = kmap_atomic_idx_push();
2434 idx = type + KM_TYPE_NR * smp_processor_id();
2435 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2436 - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2437 + WARN_ON(!pte_none(*(kmap_pte - idx)));
2439 +#ifdef CONFIG_PREEMPT_RT_FULL
2440 + current->kmap_pte[type] = pte;
2442 + set_pte(kmap_pte - idx, pte);
2443 arch_flush_lazy_mmu_mode();
2445 return (void *)vaddr;
2447 * is a bad idea also, in case the page changes cacheability
2448 * attributes or becomes a protected page in a hypervisor.
2450 +#ifdef CONFIG_PREEMPT_RT_FULL
2451 + current->kmap_pte[type] = __pte(0);
2453 kpte_clear_flush(kmap_pte-idx, vaddr);
2454 kmap_atomic_idx_pop();
2456 diff -Nur linux-4.9.6.orig/arch/x86/platform/uv/tlb_uv.c linux-4.9.6/arch/x86/platform/uv/tlb_uv.c
2457 --- linux-4.9.6.orig/arch/x86/platform/uv/tlb_uv.c 2017-01-26 08:25:24.000000000 +0100
2458 +++ linux-4.9.6/arch/x86/platform/uv/tlb_uv.c 2017-01-28 13:59:09.903655111 +0100
2461 quiesce_local_uvhub(hmaster);
2463 - spin_lock(&hmaster->queue_lock);
2464 + raw_spin_lock(&hmaster->queue_lock);
2465 reset_with_ipi(&bau_desc->distribution, bcp);
2466 - spin_unlock(&hmaster->queue_lock);
2467 + raw_spin_unlock(&hmaster->queue_lock);
2469 end_uvhub_quiesce(hmaster);
2473 quiesce_local_uvhub(hmaster);
2475 - spin_lock(&hmaster->queue_lock);
2476 + raw_spin_lock(&hmaster->queue_lock);
2477 reset_with_ipi(&bau_desc->distribution, bcp);
2478 - spin_unlock(&hmaster->queue_lock);
2479 + raw_spin_unlock(&hmaster->queue_lock);
2481 end_uvhub_quiesce(hmaster);
2486 hmaster = bcp->uvhub_master;
2487 - spin_lock(&hmaster->disable_lock);
2488 + raw_spin_lock(&hmaster->disable_lock);
2489 if (!bcp->baudisabled) {
2490 stat->s_bau_disabled++;
2496 - spin_unlock(&hmaster->disable_lock);
2497 + raw_spin_unlock(&hmaster->disable_lock);
2500 static void count_max_concurr(int stat, struct bau_control *bcp,
2503 static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2505 - spinlock_t *lock = &hmaster->uvhub_lock;
2506 + raw_spinlock_t *lock = &hmaster->uvhub_lock;
2509 v = &hmaster->active_descriptor_count;
2510 @@ -1002,7 +1002,7 @@
2511 struct bau_control *hmaster;
2513 hmaster = bcp->uvhub_master;
2514 - spin_lock(&hmaster->disable_lock);
2515 + raw_spin_lock(&hmaster->disable_lock);
2516 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2517 stat->s_bau_reenabled++;
2518 for_each_present_cpu(tcpu) {
2519 @@ -1014,10 +1014,10 @@
2520 tbcp->period_giveups = 0;
2523 - spin_unlock(&hmaster->disable_lock);
2524 + raw_spin_unlock(&hmaster->disable_lock);
2527 - spin_unlock(&hmaster->disable_lock);
2528 + raw_spin_unlock(&hmaster->disable_lock);
2532 @@ -1940,9 +1940,9 @@
2533 bcp->cong_reps = congested_reps;
2534 bcp->disabled_period = sec_2_cycles(disabled_period);
2535 bcp->giveup_limit = giveup_limit;
2536 - spin_lock_init(&bcp->queue_lock);
2537 - spin_lock_init(&bcp->uvhub_lock);
2538 - spin_lock_init(&bcp->disable_lock);
2539 + raw_spin_lock_init(&bcp->queue_lock);
2540 + raw_spin_lock_init(&bcp->uvhub_lock);
2541 + raw_spin_lock_init(&bcp->disable_lock);
2545 diff -Nur linux-4.9.6.orig/arch/x86/platform/uv/uv_time.c linux-4.9.6/arch/x86/platform/uv/uv_time.c
2546 --- linux-4.9.6.orig/arch/x86/platform/uv/uv_time.c 2017-01-26 08:25:24.000000000 +0100
2547 +++ linux-4.9.6/arch/x86/platform/uv/uv_time.c 2017-01-28 13:59:09.903655111 +0100
2550 /* There is one of these allocated per node */
2551 struct uv_rtc_timer_head {
2553 + raw_spinlock_t lock;
2554 /* next cpu waiting for timer, local node relative: */
2556 /* number of cpus on this node: */
2558 uv_rtc_deallocate_timers();
2561 - spin_lock_init(&head->lock);
2562 + raw_spin_lock_init(&head->lock);
2563 head->ncpus = uv_blade_nr_possible_cpus(bid);
2564 head->next_cpu = -1;
2565 blade_info[bid] = head;
2567 unsigned long flags;
2570 - spin_lock_irqsave(&head->lock, flags);
2571 + raw_spin_lock_irqsave(&head->lock, flags);
2573 next_cpu = head->next_cpu;
2575 @@ -243,12 +243,12 @@
2576 if (uv_setup_intr(cpu, expires)) {
2578 uv_rtc_find_next_timer(head, pnode);
2579 - spin_unlock_irqrestore(&head->lock, flags);
2580 + raw_spin_unlock_irqrestore(&head->lock, flags);
2585 - spin_unlock_irqrestore(&head->lock, flags);
2586 + raw_spin_unlock_irqrestore(&head->lock, flags);
2591 unsigned long flags;
2594 - spin_lock_irqsave(&head->lock, flags);
2595 + raw_spin_lock_irqsave(&head->lock, flags);
2597 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2600 uv_rtc_find_next_timer(head, pnode);
2603 - spin_unlock_irqrestore(&head->lock, flags);
2604 + raw_spin_unlock_irqrestore(&head->lock, flags);
2608 @@ -299,13 +299,18 @@
2609 static cycle_t uv_read_rtc(struct clocksource *cs)
2611 unsigned long offset;
2614 + preempt_disable();
2615 if (uv_get_min_hub_revision_id() == 1)
2618 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2620 - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2621 + cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
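
With the elided lines restored, the modified uv_read_rtc() plausibly becomes the following (a reconstruction): the preempt_disable()/preempt_enable() pair pins the task so that uv_blade_processor_id() and the MMR read are guaranteed to happen on the same CPU.

	static cycle_t uv_read_rtc(struct clocksource *cs)
	{
		unsigned long offset;
		cycle_t cycles;

		preempt_disable();
		if (uv_get_min_hub_revision_id() == 1)
			offset = 0;
		else
			offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;

		cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
		preempt_enable();

		return cycles;
	}
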
2628 diff -Nur linux-4.9.6.orig/block/blk-core.c linux-4.9.6/block/blk-core.c
2629 --- linux-4.9.6.orig/block/blk-core.c 2017-01-26 08:25:24.000000000 +0100
2630 +++ linux-4.9.6/block/blk-core.c 2017-01-28 13:59:09.903655111 +0100
2633 INIT_LIST_HEAD(&rq->queuelist);
2634 INIT_LIST_HEAD(&rq->timeout_list);
2635 +#ifdef CONFIG_PREEMPT_RT_FULL
2636 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2640 rq->__sector = (sector_t) -1;
2643 void blk_start_queue(struct request_queue *q)
2645 - WARN_ON(!irqs_disabled());
2646 + WARN_ON_NONRT(!irqs_disabled());
2648 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2654 - ret = wait_event_interruptible(q->mq_freeze_wq,
2655 + ret = swait_event_interruptible(q->mq_freeze_wq,
2656 !atomic_read(&q->mq_freeze_depth) ||
2657 blk_queue_dying(q));
2658 if (blk_queue_dying(q))
2660 struct request_queue *q =
2661 container_of(ref, struct request_queue, q_usage_counter);
2663 - wake_up_all(&q->mq_freeze_wq);
2664 + swake_up_all(&q->mq_freeze_wq);
2667 static void blk_rq_timed_out_timer(unsigned long data)
2669 q->bypass_depth = 1;
2670 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2672 - init_waitqueue_head(&q->mq_freeze_wq);
2673 + init_swait_queue_head(&q->mq_freeze_wq);
2676 * Init percpu_ref in atomic mode so that it's faster to shutdown.
2677 @@ -3177,7 +3180,7 @@
2678 blk_run_queue_async(q);
2681 - spin_unlock(q->queue_lock);
2682 + spin_unlock_irq(q->queue_lock);
2685 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2686 @@ -3225,7 +3228,6 @@
2687 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2689 struct request_queue *q;
2690 - unsigned long flags;
2694 @@ -3245,11 +3247,6 @@
2699 - * Save and disable interrupts here, to avoid doing it for every
2700 - * queue lock we have to take.
2702 - local_irq_save(flags);
2703 while (!list_empty(&list)) {
2704 rq = list_entry_rq(list.next);
2705 list_del_init(&rq->queuelist);
2706 @@ -3262,7 +3259,7 @@
2707 queue_unplugged(q, depth, from_schedule);
2710 - spin_lock(q->queue_lock);
2711 + spin_lock_irq(q->queue_lock);
2715 @@ -3289,8 +3286,6 @@
2718 queue_unplugged(q, depth, from_schedule);
2720 - local_irq_restore(flags);
2723 void blk_finish_plug(struct blk_plug *plug)
2724 diff -Nur linux-4.9.6.orig/block/blk-ioc.c linux-4.9.6/block/blk-ioc.c
2725 --- linux-4.9.6.orig/block/blk-ioc.c 2017-01-26 08:25:24.000000000 +0100
2726 +++ linux-4.9.6/block/blk-ioc.c 2017-01-28 13:59:09.903655111 +0100
2728 #include <linux/bio.h>
2729 #include <linux/blkdev.h>
2730 #include <linux/slab.h>
2731 +#include <linux/delay.h>
2736 spin_unlock(q->queue_lock);
2738 spin_unlock_irqrestore(&ioc->lock, flags);
2741 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
2745 spin_unlock(icq->q->queue_lock);
2747 spin_unlock_irqrestore(&ioc->lock, flags);
2753 diff -Nur linux-4.9.6.orig/block/blk-mq.c linux-4.9.6/block/blk-mq.c
2754 --- linux-4.9.6.orig/block/blk-mq.c 2017-01-26 08:25:24.000000000 +0100
2755 +++ linux-4.9.6/block/blk-mq.c 2017-01-28 13:59:09.903655111 +0100
2758 static void blk_mq_freeze_queue_wait(struct request_queue *q)
2760 - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2761 + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2766 WARN_ON_ONCE(freeze_depth < 0);
2767 if (!freeze_depth) {
2768 percpu_ref_reinit(&q->q_usage_counter);
2769 - wake_up_all(&q->mq_freeze_wq);
2770 + swake_up_all(&q->mq_freeze_wq);
2773 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
2775 * dying, we need to ensure that processes currently waiting on
2776 * the queue are notified as well.
2778 - wake_up_all(&q->mq_freeze_wq);
2779 + swake_up_all(&q->mq_freeze_wq);
2782 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
2787 +#ifdef CONFIG_PREEMPT_RT_FULL
2788 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2790 INIT_LIST_HEAD(&rq->timeout_list);
2793 @@ -345,6 +348,17 @@
2795 EXPORT_SYMBOL(blk_mq_end_request);
2797 +#ifdef CONFIG_PREEMPT_RT_FULL
2799 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
2801 + struct request *rq = container_of(work, struct request, work);
2803 + rq->q->softirq_done_fn(rq);
2808 static void __blk_mq_complete_request_remote(void *data)
2810 struct request *rq = data;
2812 rq->q->softirq_done_fn(rq);
2817 static void blk_mq_ipi_complete_request(struct request *rq)
2819 struct blk_mq_ctx *ctx = rq->mq_ctx;
2820 @@ -363,19 +379,23 @@
2825 + cpu = get_cpu_light();
2826 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
2827 shared = cpus_share_cache(cpu, ctx->cpu);
2829 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
2830 +#ifdef CONFIG_PREEMPT_RT_FULL
2831 + schedule_work_on(ctx->cpu, &rq->work);
2833 rq->csd.func = __blk_mq_complete_request_remote;
2836 smp_call_function_single_async(ctx->cpu, &rq->csd);
2839 rq->q->softirq_done_fn(rq);
2845 static void __blk_mq_complete_request(struct request *rq)
2846 @@ -915,14 +935,14 @@
2849 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
2850 - int cpu = get_cpu();
2851 + int cpu = get_cpu_light();
2852 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
2853 __blk_mq_run_hw_queue(hctx);
2863 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
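
On RT the cross-CPU completion IPI is avoided: smp_call_function_single_async() runs its callback in hard-irq context, where softirq_done_fn() may take sleeping locks, so a work item is queued on the target CPU instead. Likewise get_cpu()/put_cpu() become get_cpu_light()/put_cpu_light(), which on RT only pin the task to its CPU rather than disabling preemption. Reassembled against mainline 4.9 (the csd setup lines are restored, not shown in the hunk), the dispatch decision becomes:

	static void blk_mq_ipi_complete_request(struct request *rq)
	{
		struct blk_mq_ctx *ctx = rq->mq_ctx;
		bool shared = false;
		int cpu;

		cpu = get_cpu_light();
		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
			shared = cpus_share_cache(cpu, ctx->cpu);

		if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
	#ifdef CONFIG_PREEMPT_RT_FULL
			/* runs __blk_mq_complete_request_remote_work() on ctx->cpu */
			schedule_work_on(ctx->cpu, &rq->work);
	#else
			rq->csd.func = __blk_mq_complete_request_remote;
			rq->csd.info = rq;
			rq->csd.flags = 0;
			smp_call_function_single_async(ctx->cpu, &rq->csd);
	#endif
		} else {
			rq->q->softirq_done_fn(rq);
		}
		put_cpu_light();
	}
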
2864 diff -Nur linux-4.9.6.orig/block/blk-mq.h linux-4.9.6/block/blk-mq.h
2865 --- linux-4.9.6.orig/block/blk-mq.h 2017-01-26 08:25:24.000000000 +0100
2866 +++ linux-4.9.6/block/blk-mq.h 2017-01-28 13:59:09.903655111 +0100
2869 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
2871 - return __blk_mq_get_ctx(q, get_cpu());
2872 + return __blk_mq_get_ctx(q, get_cpu_light());
2875 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
2881 struct blk_mq_alloc_data {
2882 diff -Nur linux-4.9.6.orig/block/blk-softirq.c linux-4.9.6/block/blk-softirq.c
2883 --- linux-4.9.6.orig/block/blk-softirq.c 2017-01-26 08:25:24.000000000 +0100
2884 +++ linux-4.9.6/block/blk-softirq.c 2017-01-28 13:59:09.903655111 +0100
2886 raise_softirq_irqoff(BLOCK_SOFTIRQ);
2888 local_irq_restore(flags);
2889 + preempt_check_resched_rt();
2894 this_cpu_ptr(&blk_cpu_done));
2895 raise_softirq_irqoff(BLOCK_SOFTIRQ);
2897 + preempt_check_resched_rt();
2904 local_irq_restore(flags);
2905 + preempt_check_resched_rt();
2909 diff -Nur linux-4.9.6.orig/block/bounce.c linux-4.9.6/block/bounce.c
2910 --- linux-4.9.6.orig/block/bounce.c 2017-01-26 08:25:24.000000000 +0100
2911 +++ linux-4.9.6/block/bounce.c 2017-01-28 13:59:09.903655111 +0100
2913 unsigned long flags;
2916 - local_irq_save(flags);
2917 + local_irq_save_nort(flags);
2918 vto = kmap_atomic(to->bv_page);
2919 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
2921 - local_irq_restore(flags);
2922 + local_irq_restore_nort(flags);
2925 #else /* CONFIG_HIGHMEM */
2926 diff -Nur linux-4.9.6.orig/crypto/algapi.c linux-4.9.6/crypto/algapi.c
2927 --- linux-4.9.6.orig/crypto/algapi.c 2017-01-26 08:25:24.000000000 +0100
2928 +++ linux-4.9.6/crypto/algapi.c 2017-01-28 13:59:09.903655111 +0100
2929 @@ -718,13 +718,13 @@
2931 int crypto_register_notifier(struct notifier_block *nb)
2933 - return blocking_notifier_chain_register(&crypto_chain, nb);
2934 + return srcu_notifier_chain_register(&crypto_chain, nb);
2936 EXPORT_SYMBOL_GPL(crypto_register_notifier);
2938 int crypto_unregister_notifier(struct notifier_block *nb)
2940 - return blocking_notifier_chain_unregister(&crypto_chain, nb);
2941 + return srcu_notifier_chain_unregister(&crypto_chain, nb);
2943 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
2945 diff -Nur linux-4.9.6.orig/crypto/api.c linux-4.9.6/crypto/api.c
2946 --- linux-4.9.6.orig/crypto/api.c 2017-01-26 08:25:24.000000000 +0100
2947 +++ linux-4.9.6/crypto/api.c 2017-01-28 13:59:09.903655111 +0100
2949 DECLARE_RWSEM(crypto_alg_sem);
2950 EXPORT_SYMBOL_GPL(crypto_alg_sem);
2952 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
2953 +SRCU_NOTIFIER_HEAD(crypto_chain);
2954 EXPORT_SYMBOL_GPL(crypto_chain);
2956 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
2957 @@ -236,10 +236,10 @@
2961 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
2962 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
2963 if (ok == NOTIFY_DONE) {
2964 request_module("cryptomgr");
2965 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
2966 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
2970 diff -Nur linux-4.9.6.orig/crypto/internal.h linux-4.9.6/crypto/internal.h
2971 --- linux-4.9.6.orig/crypto/internal.h 2017-01-26 08:25:24.000000000 +0100
2972 +++ linux-4.9.6/crypto/internal.h 2017-01-28 13:59:09.903655111 +0100
2975 extern struct list_head crypto_alg_list;
2976 extern struct rw_semaphore crypto_alg_sem;
2977 -extern struct blocking_notifier_head crypto_chain;
2978 +extern struct srcu_notifier_head crypto_chain;
2980 #ifdef CONFIG_PROC_FS
2981 void __init crypto_init_proc(void);
2984 static inline void crypto_notify(unsigned long val, void *v)
2986 - blocking_notifier_call_chain(&crypto_chain, val, v);
2987 + srcu_notifier_call_chain(&crypto_chain, val, v);
2990 #endif /* _CRYPTO_INTERNAL_H */
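
For callers nothing changes: the chain head keeps its name and the register/unregister wrappers keep their signatures. A hypothetical consumer (my_cb and my_nb are made-up names) still looks like:

	static int my_cb(struct notifier_block *nb, unsigned long val, void *v)
	{
		/* react to crypto notification events */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_nb = {
		.notifier_call = my_cb,
	};

	/* in init code: */
	crypto_register_notifier(&my_nb);

The gain is on the call side: srcu_notifier_call_chain() only takes an SRCU read lock, so invoking the chain no longer contends on the blocking chain's rw_semaphore.
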
2991 diff -Nur linux-4.9.6.orig/Documentation/sysrq.txt linux-4.9.6/Documentation/sysrq.txt
2992 --- linux-4.9.6.orig/Documentation/sysrq.txt 2017-01-26 08:25:24.000000000 +0100
2993 +++ linux-4.9.6/Documentation/sysrq.txt 2017-01-28 13:59:09.887654498 +0100
2995 On other - If you know of the key combos for other architectures, please
2996 let me know so I can add them to this section.
2998 -On all - write a character to /proc/sysrq-trigger. e.g.:
3000 +On all - write a character to /proc/sysrq-trigger, e.g.:
3001 echo t > /proc/sysrq-trigger
3003 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
3004 + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
3005 + Send an ICMP echo request with this pattern plus the particular
3006 + SysRq command key. Example:
3007 + # ping -c1 -s57 -p0102030468
3008 + will trigger the SysRq-H (help) command.
3011 * What are the 'command' keys?
3012 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3013 'b' - Will immediately reboot the system without syncing or unmounting
3014 diff -Nur linux-4.9.6.orig/Documentation/trace/histograms.txt linux-4.9.6/Documentation/trace/histograms.txt
3015 --- linux-4.9.6.orig/Documentation/trace/histograms.txt 1970-01-01 01:00:00.000000000 +0100
3016 +++ linux-4.9.6/Documentation/trace/histograms.txt 2017-01-28 13:59:09.887654498 +0100
3018 + Using the Linux Kernel Latency Histograms
3021 +This document gives a short explanation of how to enable, configure and use
3022 +latency histograms. Latency histograms are primarily relevant in the
3023 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
3024 +and are used in the quality management of the Linux real-time patches.
3028 +* Purpose of latency histograms
3030 +A latency histogram continuously accumulates the frequencies of latency
3031 +data. There are two types of histograms:
3032 +- potential sources of latencies
3033 +- effective latencies
3036 +* Potential sources of latencies
3038 +Potential sources of latencies are code segments where interrupts,
3039 +preemption or both are disabled (aka critical sections). To create
3040 +histograms of potential sources of latency, the kernel stores the time
3041 +stamp at the start of a critical section, determines the time elapsed
3042 +when the end of the section is reached, and increments the frequency
3043 +counter of that latency value - irrespective of whether any concurrently
3044 +running process is affected by latency or not.
3045 +- Configuration items (in the Kernel hacking/Tracers submenu)
3046 + CONFIG_INTERRUPT_OFF_LATENCY
3047 + CONFIG_PREEMPT_OFF_LATENCY
3050 +* Effective latencies
3052 +Effective latencies actually occur during the wakeup of a process. To
3053 +determine effective latencies, the kernel stores the time stamp when a
3054 +process is scheduled to be woken up, and determines the duration of the
3055 +wakeup time shortly before control is passed over to this process. Note
3056 +that the apparent latency in user space may be somewhat longer, since the
3057 +process may be interrupted after control is passed over to it but before
3058 +the execution in user space takes place. Simply measuring the interval
3059 +between enqueuing and wakeup may also not be appropriate in cases when a
3060 +process is scheduled as a result of a timer expiration. The timer may have
3061 +missed its deadline, e.g. due to disabled interrupts, but this latency
3062 +would not be registered. Therefore, the offsets of missed timers are
3063 +recorded in a separate histogram. If both wakeup latency and missed timer
3064 +offsets are configured and enabled, a third histogram may be enabled that
3065 +records the overall latency as a sum of the timer latency, if any, and the
3066 +wakeup latency. This histogram is called "timerandwakeup".
3067 +- Configuration items (in the Kernel hacking/Tracers submenu)
3068 + CONFIG_WAKEUP_LATENCY
3069 + CONFIG_MISSED_TIMER_OFFSETS
3074 +The interface to the administration of the latency histograms is located
3075 +in the debugfs file system. To mount it, either enter
3077 +mount -t sysfs nodev /sys
3078 +mount -t debugfs nodev /sys/kernel/debug
3080 +from shell command line level, or add
3082 +nodev /sys sysfs defaults 0 0
3083 +nodev /sys/kernel/debug debugfs defaults 0 0
3085 +to the file /etc/fstab. All latency histogram related files are then
3086 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
3087 +particular histogram type is enabled by writing non-zero to the related
3088 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
3089 +Select "preemptirqsoff" for the histograms of potential sources of
3090 +latencies and "wakeup" for histograms of effective latencies etc. The
3091 +histogram data - one per CPU - are available in the files
3093 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
3094 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
3095 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
3096 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
3097 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
3098 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
3099 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
3101 +The histograms are reset by writing non-zero to the file "reset" in a
3102 +particular latency directory. To reset all latency data, use
3106 +TRACINGDIR=/sys/kernel/debug/tracing
3107 +HISTDIR=$TRACINGDIR/latency_hist
3109 +if test -d $HISTDIR
3112 + for i in `find . | grep /reset$`
3121 +Latency data are stored with a resolution of one microsecond. The
3122 +maximum latency is 10,240 microseconds. The data are only valid if the
3123 +overflow register is empty. Every output line contains the latency in
3124 +microseconds in the first column and the number of samples in the second
3125 +column. To display only lines with a positive latency count, use, for example:
3128 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
3130 +#Minimum latency: 0 microseconds.
3131 +#Average latency: 0 microseconds.
3132 +#Maximum latency: 25 microseconds.
3133 +#Total samples: 3104770694
3134 +#There are 0 samples greater or equal than 10240 microseconds
3163 +* Wakeup latency of a selected process
3165 +To only collect wakeup latency data of a particular process, write the
3166 +PID of the requested process to
3168 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
3170 +PIDs are not considered if this variable is set to 0.
3173 +* Details of the process with the highest wakeup latency so far
3175 +Selected data of the process that suffered from the highest wakeup
3176 +latency that occurred on a particular CPU are available in the file
3178 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
3180 +In addition, other relevant system data at the time when the
3181 +latency occurred are given.
3183 +The format of the data is (all in one line):
3184 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
3185 +<- <PID> <Priority> <Command> <Timestamp>
3187 +The value of <Timeroffset> is only relevant in the combined timer
3188 +and wakeup latency recording. In the wakeup recording, it is
3189 +always 0; in the missed_timer_offsets recording, it is the same
3192 +When retrospectively searching for the origin of a latency while
3193 +tracing was not enabled, it may be helpful to know the name and
3194 +some basic data of the task that (finally) was switching to the
3195 +late real-time task. In addition to the victim's data, the
3196 +data of the possible culprit are therefore also displayed after the
3199 +Finally, the timestamp of the time when the latency occurred
3200 +in <seconds>.<microseconds> after the most recent system boot is provided.
3203 +These data are also reset when the wakeup histogram is reset.
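
For illustration, a max_latency-CPUx line in the format above might look like this (all values made up; shown with the same line-continuation convention):

1234 -5 78 (0) cyclictest \
<- 2104 120 kworker/3:1 1234.567890
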
3204 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/acglobal.h linux-4.9.6/drivers/acpi/acpica/acglobal.h
3205 --- linux-4.9.6.orig/drivers/acpi/acpica/acglobal.h 2017-01-26 08:25:24.000000000 +0100
3206 +++ linux-4.9.6/drivers/acpi/acpica/acglobal.h 2017-01-28 13:59:09.903655111 +0100
3210 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3211 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
3212 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
3213 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3215 /* Mutex for _OSI support */
3216 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/hwregs.c linux-4.9.6/drivers/acpi/acpica/hwregs.c
3217 --- linux-4.9.6.orig/drivers/acpi/acpica/hwregs.c 2017-01-26 08:25:24.000000000 +0100
3218 +++ linux-4.9.6/drivers/acpi/acpica/hwregs.c 2017-01-28 13:59:09.903655111 +0100
3219 @@ -363,14 +363,14 @@
3220 ACPI_BITMASK_ALL_FIXED_STATUS,
3221 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3223 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3224 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3226 /* Clear the fixed events in PM1 A/B */
3228 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3229 ACPI_BITMASK_ALL_FIXED_STATUS);
3231 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3232 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3234 if (ACPI_FAILURE(status)) {
3236 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/hwxface.c linux-4.9.6/drivers/acpi/acpica/hwxface.c
3237 --- linux-4.9.6.orig/drivers/acpi/acpica/hwxface.c 2017-01-26 08:25:24.000000000 +0100
3238 +++ linux-4.9.6/drivers/acpi/acpica/hwxface.c 2017-01-28 13:59:09.903655111 +0100
3240 return_ACPI_STATUS(AE_BAD_PARAMETER);
3243 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3244 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3247 * At this point, we know that the parent register is one of the
3252 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3253 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3254 return_ACPI_STATUS(status);
3257 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/utmutex.c linux-4.9.6/drivers/acpi/acpica/utmutex.c
3258 --- linux-4.9.6.orig/drivers/acpi/acpica/utmutex.c 2017-01-26 08:25:24.000000000 +0100
3259 +++ linux-4.9.6/drivers/acpi/acpica/utmutex.c 2017-01-28 13:59:09.903655111 +0100
3261 return_ACPI_STATUS (status);
3264 - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3265 + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3266 if (ACPI_FAILURE (status)) {
3267 return_ACPI_STATUS (status);
3270 /* Delete the spinlocks */
3272 acpi_os_delete_lock(acpi_gbl_gpe_lock);
3273 - acpi_os_delete_lock(acpi_gbl_hardware_lock);
3274 + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3275 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3277 /* Delete the reader/writer lock */
3278 diff -Nur linux-4.9.6.orig/drivers/ata/libata-sff.c linux-4.9.6/drivers/ata/libata-sff.c
3279 --- linux-4.9.6.orig/drivers/ata/libata-sff.c 2017-01-26 08:25:24.000000000 +0100
3280 +++ linux-4.9.6/drivers/ata/libata-sff.c 2017-01-28 13:59:09.907655265 +0100
3282 unsigned long flags;
3283 unsigned int consumed;
3285 - local_irq_save(flags);
3286 + local_irq_save_nort(flags);
3287 consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3288 - local_irq_restore(flags);
3289 + local_irq_restore_nort(flags);
3294 unsigned long flags;
3296 /* FIXME: use a bounce buffer */
3297 - local_irq_save(flags);
3298 + local_irq_save_nort(flags);
3299 buf = kmap_atomic(page);
3301 /* do the actual data transfer */
3306 - local_irq_restore(flags);
3307 + local_irq_restore_nort(flags);
3309 buf = page_address(page);
3310 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3312 unsigned long flags;
3314 /* FIXME: use bounce buffer */
3315 - local_irq_save(flags);
3316 + local_irq_save_nort(flags);
3317 buf = kmap_atomic(page);
3319 /* do the actual data transfer */
3324 - local_irq_restore(flags);
3325 + local_irq_restore_nort(flags);
3327 buf = page_address(page);
3328 consumed = ap->ops->sff_data_xfer(dev, buf + offset,
3329 diff -Nur linux-4.9.6.orig/drivers/block/zram/zcomp.c linux-4.9.6/drivers/block/zram/zcomp.c
3330 --- linux-4.9.6.orig/drivers/block/zram/zcomp.c 2017-01-26 08:25:24.000000000 +0100
3331 +++ linux-4.9.6/drivers/block/zram/zcomp.c 2017-01-28 13:59:09.907655265 +0100
3332 @@ -118,12 +118,19 @@
3334 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3336 - return *get_cpu_ptr(comp->stream);
3337 + struct zcomp_strm *zstrm;
3339 + zstrm = *this_cpu_ptr(comp->stream);
3340 + spin_lock(&zstrm->zcomp_lock);
3344 void zcomp_stream_put(struct zcomp *comp)
3346 - put_cpu_ptr(comp->stream);
3347 + struct zcomp_strm *zstrm;
3349 + zstrm = *this_cpu_ptr(comp->stream);
3350 + spin_unlock(&zstrm->zcomp_lock);
3353 int zcomp_compress(struct zcomp_strm *zstrm,
3355 pr_err("Can't allocate a compression stream\n");
3358 + spin_lock_init(&zstrm->zcomp_lock);
3359 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3362 diff -Nur linux-4.9.6.orig/drivers/block/zram/zcomp.h linux-4.9.6/drivers/block/zram/zcomp.h
3363 --- linux-4.9.6.orig/drivers/block/zram/zcomp.h 2017-01-26 08:25:24.000000000 +0100
3364 +++ linux-4.9.6/drivers/block/zram/zcomp.h 2017-01-28 13:59:09.907655265 +0100
3366 /* compression/decompression buffer */
3368 struct crypto_comp *tfm;
3369 + spinlock_t zcomp_lock;
3372 /* dynamic per-device compression frontend */
3373 diff -Nur linux-4.9.6.orig/drivers/block/zram/zram_drv.c linux-4.9.6/drivers/block/zram/zram_drv.c
3374 --- linux-4.9.6.orig/drivers/block/zram/zram_drv.c 2017-01-26 08:25:24.000000000 +0100
3375 +++ linux-4.9.6/drivers/block/zram/zram_drv.c 2017-01-28 13:59:09.907655265 +0100
3380 + zram_meta_init_table_locks(meta, disksize);
3385 @@ -575,28 +577,28 @@
3386 struct zram_meta *meta = zram->meta;
3387 unsigned long handle;
3389 + struct zcomp_strm *zstrm;
3391 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3392 + zram_lock_table(&meta->table[index]);
3393 handle = meta->table[index].handle;
3394 size = zram_get_obj_size(meta, index);
3396 if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3397 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3398 + zram_unlock_table(&meta->table[index]);
3403 + zstrm = zcomp_stream_get(zram->comp);
3404 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3405 if (size == PAGE_SIZE) {
3406 copy_page(mem, cmem);
3408 - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3410 ret = zcomp_decompress(zstrm, cmem, size, mem);
3411 - zcomp_stream_put(zram->comp);
3413 zs_unmap_object(meta->mem_pool, handle);
3414 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3415 + zcomp_stream_put(zram->comp);
3416 + zram_unlock_table(&meta->table[index]);
3418 /* Should NEVER happen. Return bio error if it does. */
3419 if (unlikely(ret)) {
3420 @@ -616,14 +618,14 @@
3421 struct zram_meta *meta = zram->meta;
3422 page = bvec->bv_page;
3424 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3425 + zram_lock_table(&meta->table[index]);
3426 if (unlikely(!meta->table[index].handle) ||
3427 zram_test_flag(meta, index, ZRAM_ZERO)) {
3428 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3429 + zram_unlock_table(&meta->table[index]);
3430 handle_zero_page(bvec);
3433 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3434 + zram_unlock_table(&meta->table[index]);
3436 if (is_partial_io(bvec))
3437 /* Use a temporary buffer to decompress the page */
3438 @@ -700,10 +702,10 @@
3440 kunmap_atomic(user_mem);
3441 /* Free memory associated with this sector now. */
3442 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3443 + zram_lock_table(&meta->table[index]);
3444 zram_free_page(zram, index);
3445 zram_set_flag(meta, index, ZRAM_ZERO);
3446 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3447 + zram_unlock_table(&meta->table[index]);
3449 atomic64_inc(&zram->stats.zero_pages);
3451 @@ -794,12 +796,12 @@
3452 * Free memory associated with this sector
3453 * before overwriting unused sectors.
3455 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3456 + zram_lock_table(&meta->table[index]);
3457 zram_free_page(zram, index);
3459 meta->table[index].handle = handle;
3460 zram_set_obj_size(meta, index, clen);
3461 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3462 + zram_unlock_table(&meta->table[index]);
3465 atomic64_add(clen, &zram->stats.compr_data_size);
3469 while (n >= PAGE_SIZE) {
3470 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3471 + zram_lock_table(&meta->table[index]);
3472 zram_free_page(zram, index);
3473 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3474 + zram_unlock_table(&meta->table[index]);
3475 atomic64_inc(&zram->stats.notify_free);
3479 zram = bdev->bd_disk->private_data;
3482 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3483 + zram_lock_table(&meta->table[index]);
3484 zram_free_page(zram, index);
3485 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3486 + zram_unlock_table(&meta->table[index]);
3487 atomic64_inc(&zram->stats.notify_free);
3490 diff -Nur linux-4.9.6.orig/drivers/block/zram/zram_drv.h linux-4.9.6/drivers/block/zram/zram_drv.h
3491 --- linux-4.9.6.orig/drivers/block/zram/zram_drv.h 2017-01-26 08:25:24.000000000 +0100
3492 +++ linux-4.9.6/drivers/block/zram/zram_drv.h 2017-01-28 13:59:09.907655265 +0100
3494 struct zram_table_entry {
3495 unsigned long handle;
3496 unsigned long value;
3497 +#ifdef CONFIG_PREEMPT_RT_BASE
3503 @@ -120,4 +123,42 @@
3505 bool claim; /* Protected by bdev->bd_mutex */
3508 +#ifndef CONFIG_PREEMPT_RT_BASE
3509 +static inline void zram_lock_table(struct zram_table_entry *table)
3511 + bit_spin_lock(ZRAM_ACCESS, &table->value);
3514 +static inline void zram_unlock_table(struct zram_table_entry *table)
3516 + bit_spin_unlock(ZRAM_ACCESS, &table->value);
3519 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3520 +#else /* CONFIG_PREEMPT_RT_BASE */
3521 +static inline void zram_lock_table(struct zram_table_entry *table)
3523 + spin_lock(&table->lock);
3524 + __set_bit(ZRAM_ACCESS, &table->value);
3527 +static inline void zram_unlock_table(struct zram_table_entry *table)
3529 + __clear_bit(ZRAM_ACCESS, &table->value);
3530 + spin_unlock(&table->lock);
3533 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3535 + size_t num_pages = disksize >> PAGE_SHIFT;
3538 + for (index = 0; index < num_pages; index++) {
3539 + spinlock_t *lock = &meta->table[index].lock;
3540 + spin_lock_init(lock);
3543 +#endif /* CONFIG_PREEMPT_RT_BASE */
3546 diff -Nur linux-4.9.6.orig/drivers/char/random.c linux-4.9.6/drivers/char/random.c
3547 --- linux-4.9.6.orig/drivers/char/random.c 2017-01-26 08:25:24.000000000 +0100
3548 +++ linux-4.9.6/drivers/char/random.c 2017-01-28 13:59:09.907655265 +0100
3549 @@ -1028,8 +1028,6 @@
3551 long delta, delta2, delta3;
3553 - preempt_disable();
3555 sample.jiffies = jiffies;
3556 sample.cycles = random_get_entropy();
3558 @@ -1070,7 +1068,6 @@
3560 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3565 void add_input_randomness(unsigned int type, unsigned int code,
3566 @@ -1123,28 +1120,27 @@
3567 return *(ptr + f->reg_idx++);
3570 -void add_interrupt_randomness(int irq, int irq_flags)
3571 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3573 struct entropy_store *r;
3574 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
3575 - struct pt_regs *regs = get_irq_regs();
3576 unsigned long now = jiffies;
3577 cycles_t cycles = random_get_entropy();
3578 __u32 c_high, j_high;
3584 - cycles = get_reg(fast_pool, regs);
3585 + cycles = get_reg(fast_pool, NULL);
3586 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3587 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3588 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3589 fast_pool->pool[1] ^= now ^ c_high;
3590 - ip = regs ? instruction_pointer(regs) : _RET_IP_;
3593 fast_pool->pool[2] ^= ip;
3594 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3595 - get_reg(fast_pool, regs);
3596 + get_reg(fast_pool, NULL);
3598 fast_mix(fast_pool);
3599 add_interrupt_bench(cycles);
3600 diff -Nur linux-4.9.6.orig/drivers/clocksource/tcb_clksrc.c linux-4.9.6/drivers/clocksource/tcb_clksrc.c
3601 --- linux-4.9.6.orig/drivers/clocksource/tcb_clksrc.c 2017-01-26 08:25:24.000000000 +0100
3602 +++ linux-4.9.6/drivers/clocksource/tcb_clksrc.c 2017-01-28 13:59:09.907655265 +0100
3604 * this 32 bit free-running counter. the second channel is not used.
3606 * - The third channel may be used to provide a 16-bit clockevent
3607 - * source, used in either periodic or oneshot mode. This runs
3608 - * at 32 KiHZ, and can handle delays of up to two seconds.
3609 + * source, used in either periodic or oneshot mode.
3611 * A boot clocksource and clockevent source are also currently needed,
3612 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3614 struct tc_clkevt_device {
3615 struct clock_event_device clkevt;
3623 return container_of(clkevt, struct tc_clkevt_device, clkevt);
3626 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3627 - * because using one of the divided clocks would usually mean the
3628 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3630 - * A divided clock could be good for high resolution timers, since
3631 - * 30.5 usec resolution can seem "low".
3633 static u32 timer_clock;
3635 +static void tc_clk_disable(struct clock_event_device *d)
3637 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3639 + clk_disable(tcd->clk);
3640 + tcd->clk_enabled = false;
3643 +static void tc_clk_enable(struct clock_event_device *d)
3645 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3647 + if (tcd->clk_enabled)
3649 + clk_enable(tcd->clk);
3650 + tcd->clk_enabled = true;
3653 static int tc_shutdown(struct clock_event_device *d)
3655 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3658 __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3659 __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3663 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3666 if (!clockevent_state_detached(d))
3667 - clk_disable(tcd->clk);
3668 + tc_clk_disable(d);
3673 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3676 - clk_enable(tcd->clk);
3679 - /* slow clock, count up to RC, then irq and stop */
3680 + /* count up to RC, then irq and stop */
3681 __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3682 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3683 __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3684 @@ -134,12 +152,12 @@
3685 /* By not making the gentime core emulate periodic mode on top
3686 * of oneshot, we get lower overhead and improved accuracy.
3688 - clk_enable(tcd->clk);
3691 - /* slow clock, count up to RC, then irq and restart */
3692 + /* count up to RC, then irq and restart */
3693 __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3694 regs + ATMEL_TC_REG(2, CMR));
3695 - __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3696 + __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3698 /* Enable clock and interrupts on RC compare */
3699 __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3700 @@ -166,9 +184,13 @@
3701 .features = CLOCK_EVT_FEAT_PERIODIC |
3702 CLOCK_EVT_FEAT_ONESHOT,
3703 /* Should be lower than at91rm9200's system timer */
3704 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3709 .set_next_event = tc_next_event,
3710 - .set_state_shutdown = tc_shutdown,
3711 + .set_state_shutdown = tc_shutdown_clk_off,
3712 .set_state_periodic = tc_set_periodic,
3713 .set_state_oneshot = tc_set_oneshot,
3719 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3720 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3722 + unsigned divisor = atmel_tc_divisors[divisor_idx];
3724 struct clk *t2_clk = tc->clk[2];
3725 int irq = tc->irq[2];
3726 @@ -210,7 +233,11 @@
3727 clkevt.regs = tc->regs;
3728 clkevt.clk = t2_clk;
3730 - timer_clock = clk32k_divisor_idx;
3731 + timer_clock = divisor_idx;
3733 + clkevt.freq = 32768;
3735 + clkevt.freq = clk_get_rate(t2_clk) / divisor;
3737 clkevt.clkevt.cpumask = cpumask_of(0);
3743 - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3744 + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3748 @@ -358,7 +385,11 @@
3749 goto err_disable_t1;
3751 /* channel 2: periodic and oneshot timer support */
3752 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3753 ret = setup_clkevents(tc, clk32k_divisor_idx);
3755 + ret = setup_clkevents(tc, best_divisor_idx);
3758 goto err_unregister_clksrc;
3760 diff -Nur linux-4.9.6.orig/drivers/clocksource/timer-atmel-pit.c linux-4.9.6/drivers/clocksource/timer-atmel-pit.c
3761 --- linux-4.9.6.orig/drivers/clocksource/timer-atmel-pit.c 2017-01-26 08:25:24.000000000 +0100
3762 +++ linux-4.9.6/drivers/clocksource/timer-atmel-pit.c 2017-01-28 13:59:09.907655265 +0100
3767 + bool irq_requested;
3773 /* disable irq, leaving the clocksource active */
3774 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3775 + if (data->irq_requested) {
3776 + free_irq(data->irq, data);
3777 + data->irq_requested = false;
3782 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3784 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
3786 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3788 struct pit_data *data = clkevt_to_pit_data(dev);
3791 + ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3792 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3793 + "at91_tick", data);
3795 + panic(pr_fmt("Unable to setup IRQ\n"));
3797 + data->irq_requested = true;
3799 /* update clocksource counter */
3800 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3801 @@ -230,15 +245,6 @@
3805 - /* Set up irq handler */
3806 - ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3807 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3808 - "at91_tick", data);
3810 - pr_err("Unable to setup IRQ\n");
3814 /* Set up and register clockevents */
3815 data->clkevt.name = "pit";
3816 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3817 diff -Nur linux-4.9.6.orig/drivers/clocksource/timer-atmel-st.c linux-4.9.6/drivers/clocksource/timer-atmel-st.c
3818 --- linux-4.9.6.orig/drivers/clocksource/timer-atmel-st.c 2017-01-26 08:25:24.000000000 +0100
3819 +++ linux-4.9.6/drivers/clocksource/timer-atmel-st.c 2017-01-28 13:59:09.907655265 +0100
3820 @@ -115,18 +115,29 @@
3821 last_crtr = read_CRTR();
3824 +static int atmel_st_irq;
3826 static int clkevt32k_shutdown(struct clock_event_device *evt)
3828 clkdev32k_disable_and_flush_irq();
3830 regmap_write(regmap_st, AT91_ST_IER, irqmask);
3831 + free_irq(atmel_st_irq, regmap_st);
3835 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3839 clkdev32k_disable_and_flush_irq();
3841 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3842 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3843 + "at91_tick", regmap_st);
3845 + panic(pr_fmt("Unable to setup IRQ\n"));
3848 * ALM for oneshot irqs, set by next_event()
3849 * before 32 seconds have passed.
3850 @@ -139,8 +150,16 @@
3852 static int clkevt32k_set_periodic(struct clock_event_device *dev)
3856 clkdev32k_disable_and_flush_irq();
3858 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3859 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3860 + "at91_tick", regmap_st);
3862 + panic(pr_fmt("Unable to setup IRQ\n"));
3864 /* PIT for periodic irqs; fixed rate of 1/HZ */
3865 irqmask = AT91_ST_PITS;
3866 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3870 unsigned int sclk_rate, val;
3874 regmap_st = syscon_node_to_regmap(node);
3875 if (IS_ERR(regmap_st)) {
3876 @@ -212,21 +231,12 @@
3877 regmap_read(regmap_st, AT91_ST_SR, &val);
3879 /* Get the interrupts property */
3880 - irq = irq_of_parse_and_map(node, 0);
3882 + atmel_st_irq = irq_of_parse_and_map(node, 0);
3883 + if (!atmel_st_irq) {
3884 pr_err("Unable to get IRQ from DT\n");
3888 - /* Make IRQs happen for the system timer */
3889 - ret = request_irq(irq, at91rm9200_timer_interrupt,
3890 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3891 - "at91_tick", regmap_st);
3893 - pr_err("Unable to setup IRQ\n");
3897 sclk = of_clk_get(node, 0);
3899 pr_err("Unable to get slow clock\n");
3900 diff -Nur linux-4.9.6.orig/drivers/connector/cn_proc.c linux-4.9.6/drivers/connector/cn_proc.c
3901 --- linux-4.9.6.orig/drivers/connector/cn_proc.c 2017-01-26 08:25:24.000000000 +0100
3902 +++ linux-4.9.6/drivers/connector/cn_proc.c 2017-01-28 13:59:09.907655265 +0100
3904 #include <linux/pid_namespace.h>
3906 #include <linux/cn_proc.h>
3907 +#include <linux/locallock.h>
3910 * Size of a cn_msg followed by a proc_event structure. Since the
3913 /* proc_event_counts is used as the sequence number of the netlink message */
3914 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
3915 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
3917 static inline void send_msg(struct cn_msg *msg)
3919 - preempt_disable();
3920 + local_lock(send_msg_lock);
3922 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
3923 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
3926 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
3929 + local_unlock(send_msg_lock);
3932 void proc_fork_connector(struct task_struct *task)
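The cn_proc hunk above is the canonical local-lock conversion in this series: a bare preempt_disable()/preempt_enable() pair guarding per-CPU state becomes local_lock()/local_unlock(). On a non-RT kernel the macros compile to exactly the old pair; on RT the lock is a per-CPU sleeping spinlock, so the section stays preemptible while per-CPU exclusion is preserved. A simplified sketch of the semantics, assuming the locallock.h definitions introduced earlier in this series (not the literal header, which also handles nesting):

    /* sketch only; see include/linux/locallock.h in this series */
    #ifdef CONFIG_PREEMPT_RT_BASE
    # define local_lock(lvar)        spin_lock(this_cpu_ptr(&(lvar).lock))
    # define local_unlock(lvar)      spin_unlock(this_cpu_ptr(&(lvar).lock))
    # define local_lock_irq(lvar)    spin_lock(this_cpu_ptr(&(lvar).lock))
    # define local_unlock_irq(lvar)  spin_unlock(this_cpu_ptr(&(lvar).lock))
    #else
    # define local_lock(lvar)        preempt_disable()
    # define local_unlock(lvar)      preempt_enable()
    # define local_lock_irq(lvar)    local_irq_disable()
    # define local_unlock_irq(lvar)  local_irq_enable()
    #endif

The intel_sprite.c hunk below uses the same facility in its _irq form to replace local_irq_disable() around the vblank-evasion loop.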
3933 diff -Nur linux-4.9.6.orig/drivers/cpufreq/Kconfig.x86 linux-4.9.6/drivers/cpufreq/Kconfig.x86
3934 --- linux-4.9.6.orig/drivers/cpufreq/Kconfig.x86 2017-01-26 08:25:24.000000000 +0100
3935 +++ linux-4.9.6/drivers/cpufreq/Kconfig.x86 2017-01-28 13:59:09.907655265 +0100
3938 config X86_POWERNOW_K8
3939 tristate "AMD Opteron/Athlon64 PowerNow!"
3940 - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
3941 + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
3943 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
3944 Support for K10 and newer processors is now in acpi-cpufreq.
3945 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c linux-4.9.6/drivers/gpu/drm/i915/i915_gem_execbuffer.c
3946 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2017-01-26 08:25:24.000000000 +0100
3947 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2017-01-28 13:59:09.907655265 +0100
3948 @@ -1537,7 +1537,9 @@
3952 +#ifndef CONFIG_PREEMPT_RT_BASE
3953 trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
3956 i915_gem_execbuffer_move_to_active(vmas, params->request);
3958 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c linux-4.9.6/drivers/gpu/drm/i915/i915_gem_shrinker.c
3959 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c 2017-01-26 08:25:24.000000000 +0100
3960 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_gem_shrinker.c 2017-01-28 13:59:09.907655265 +0100
3962 if (!mutex_is_locked(mutex))
3965 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
3966 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
3967 return mutex->owner == task;
3969 /* Since UP may be pre-empted, we cannot assume that we own the lock */
3970 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_irq.c linux-4.9.6/drivers/gpu/drm/i915/i915_irq.c
3971 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_irq.c 2017-01-26 08:25:24.000000000 +0100
3972 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_irq.c 2017-01-28 13:59:09.907655265 +0100
3974 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
3976 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
3977 + preempt_disable_rt();
3979 /* Get optional system timestamp before query. */
3982 *etime = ktime_get();
3984 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
3985 + preempt_enable_rt();
3987 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
3989 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/intel_display.c linux-4.9.6/drivers/gpu/drm/i915/intel_display.c
3990 --- linux-4.9.6.orig/drivers/gpu/drm/i915/intel_display.c 2017-01-26 08:25:24.000000000 +0100
3991 +++ linux-4.9.6/drivers/gpu/drm/i915/intel_display.c 2017-01-28 13:59:09.911655420 +0100
3992 @@ -12131,7 +12131,7 @@
3993 struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
3994 struct intel_flip_work *work;
3996 - WARN_ON(!in_interrupt());
3997 + WARN_ON_NONRT(!in_interrupt());
4001 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/intel_sprite.c linux-4.9.6/drivers/gpu/drm/i915/intel_sprite.c
4002 --- linux-4.9.6.orig/drivers/gpu/drm/i915/intel_sprite.c 2017-01-26 08:25:24.000000000 +0100
4003 +++ linux-4.9.6/drivers/gpu/drm/i915/intel_sprite.c 2017-01-28 13:59:09.911655420 +0100
4005 #include <drm/drm_rect.h>
4006 #include <drm/drm_atomic.h>
4007 #include <drm/drm_plane_helper.h>
4008 +#include <linux/locallock.h>
4009 #include "intel_drv.h"
4010 #include "intel_frontbuffer.h"
4011 #include <drm/i915_drm.h>
4013 1000 * adjusted_mode->crtc_htotal);
4016 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4019 * intel_pipe_update_start() - start update of a set of display registers
4020 * @crtc: the crtc of which the registers are going to be updated
4022 min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4023 max = vblank_start - 1;
4025 - local_irq_disable();
4026 + local_lock_irq(pipe_update_lock);
4028 if (min <= 0 || max <= 0)
4030 @@ -125,11 +128,11 @@
4034 - local_irq_enable();
4035 + local_unlock_irq(pipe_update_lock);
4037 timeout = schedule_timeout(timeout);
4039 - local_irq_disable();
4040 + local_lock_irq(pipe_update_lock);
4043 finish_wait(wq, &wait);
4045 crtc->base.state->event = NULL;
4048 - local_irq_enable();
4049 + local_unlock_irq(pipe_update_lock);
4051 if (crtc->debug.start_vbl_count &&
4052 crtc->debug.start_vbl_count != end_vbl_count) {
4053 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c linux-4.9.6/drivers/gpu/drm/msm/msm_gem_shrinker.c
4054 --- linux-4.9.6.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c 2017-01-26 08:25:24.000000000 +0100
4055 +++ linux-4.9.6/drivers/gpu/drm/msm/msm_gem_shrinker.c 2017-01-28 13:59:09.911655420 +0100
4057 if (!mutex_is_locked(mutex))
4060 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4061 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4062 return mutex->owner == task;
4064 /* Since UP may be pre-empted, we cannot assume that we own the lock */
4065 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/radeon/radeon_display.c linux-4.9.6/drivers/gpu/drm/radeon/radeon_display.c
4066 --- linux-4.9.6.orig/drivers/gpu/drm/radeon/radeon_display.c 2017-01-26 08:25:24.000000000 +0100
4067 +++ linux-4.9.6/drivers/gpu/drm/radeon/radeon_display.c 2017-01-28 13:59:09.911655420 +0100
4068 @@ -1845,6 +1845,7 @@
4069 struct radeon_device *rdev = dev->dev_private;
4071 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4072 + preempt_disable_rt();
4074 /* Get optional system timestamp before query. */
4076 @@ -1937,6 +1938,7 @@
4077 *etime = ktime_get();
4079 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4080 + preempt_enable_rt();
4082 /* Decode into vertical and horizontal scanout position. */
4083 *vpos = position & 0x1fff;
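The i915 and radeon hunks above finally activate the preempt_disable_rt() placeholders the comments point at: the register sampling between the two timestamp reads must not be preempted if the computed scanout position is to mean anything, yet on RT the surrounding spinlock alone no longer disables preemption. The helpers are RT-only and vanish on mainline; roughly, assuming the preempt.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define preempt_disable_rt()  preempt_disable()
    # define preempt_enable_rt()   preempt_enable()
    #else
    # define preempt_disable_rt()  do { } while (0)
    # define preempt_enable_rt()   do { } while (0)
    #endif

fs/exec.c later in this series uses the same pair around the mm switch in exec_mmap().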
4084 diff -Nur linux-4.9.6.orig/drivers/hv/vmbus_drv.c linux-4.9.6/drivers/hv/vmbus_drv.c
4085 --- linux-4.9.6.orig/drivers/hv/vmbus_drv.c 2017-01-26 08:25:24.000000000 +0100
4086 +++ linux-4.9.6/drivers/hv/vmbus_drv.c 2017-01-28 13:59:09.911655420 +0100
4089 struct hv_message *msg;
4090 union hv_synic_event_flags *event;
4091 + struct pt_regs *regs = get_irq_regs();
4092 + u64 ip = regs ? instruction_pointer(regs) : 0;
4093 bool handled = false;
4095 page_addr = hv_context.synic_event_page[cpu];
4097 tasklet_schedule(hv_context.msg_dpc[cpu]);
4100 - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4101 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4105 diff -Nur linux-4.9.6.orig/drivers/ide/alim15x3.c linux-4.9.6/drivers/ide/alim15x3.c
4106 --- linux-4.9.6.orig/drivers/ide/alim15x3.c 2017-01-26 08:25:24.000000000 +0100
4107 +++ linux-4.9.6/drivers/ide/alim15x3.c 2017-01-28 13:59:09.911655420 +0100
4110 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4112 - local_irq_save(flags);
4113 + local_irq_save_nort(flags);
4115 if (m5229_revision < 0xC2) {
4120 pci_dev_put(isa_dev);
4121 - local_irq_restore(flags);
4122 + local_irq_restore_nort(flags);
4126 diff -Nur linux-4.9.6.orig/drivers/ide/hpt366.c linux-4.9.6/drivers/ide/hpt366.c
4127 --- linux-4.9.6.orig/drivers/ide/hpt366.c 2017-01-26 08:25:24.000000000 +0100
4128 +++ linux-4.9.6/drivers/ide/hpt366.c 2017-01-28 13:59:09.911655420 +0100
4129 @@ -1236,7 +1236,7 @@
4131 dma_old = inb(base + 2);
4133 - local_irq_save(flags);
4134 + local_irq_save_nort(flags);
4137 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4138 @@ -1247,7 +1247,7 @@
4139 if (dma_new != dma_old)
4140 outb(dma_new, base + 2);
4142 - local_irq_restore(flags);
4143 + local_irq_restore_nort(flags);
4145 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
4146 hwif->name, base, base + 7);
4147 diff -Nur linux-4.9.6.orig/drivers/ide/ide-io.c linux-4.9.6/drivers/ide/ide-io.c
4148 --- linux-4.9.6.orig/drivers/ide/ide-io.c 2017-01-26 08:25:24.000000000 +0100
4149 +++ linux-4.9.6/drivers/ide/ide-io.c 2017-01-28 13:59:09.911655420 +0100
4151 /* disable_irq_nosync ?? */
4152 disable_irq(hwif->irq);
4153 /* local CPU only, as if we were handling an interrupt */
4154 - local_irq_disable();
4155 + local_irq_disable_nort();
4156 if (hwif->polling) {
4157 startstop = handler(drive);
4158 } else if (drive_is_ready(drive)) {
4159 diff -Nur linux-4.9.6.orig/drivers/ide/ide-iops.c linux-4.9.6/drivers/ide/ide-iops.c
4160 --- linux-4.9.6.orig/drivers/ide/ide-iops.c 2017-01-26 08:25:24.000000000 +0100
4161 +++ linux-4.9.6/drivers/ide/ide-iops.c 2017-01-28 13:59:09.911655420 +0100
4162 @@ -129,12 +129,12 @@
4163 if ((stat & ATA_BUSY) == 0)
4166 - local_irq_restore(flags);
4167 + local_irq_restore_nort(flags);
4172 - local_irq_restore(flags);
4173 + local_irq_restore_nort(flags);
4176 * Allow status to settle, then read it again.
4177 diff -Nur linux-4.9.6.orig/drivers/ide/ide-io-std.c linux-4.9.6/drivers/ide/ide-io-std.c
4178 --- linux-4.9.6.orig/drivers/ide/ide-io-std.c 2017-01-26 08:25:24.000000000 +0100
4179 +++ linux-4.9.6/drivers/ide/ide-io-std.c 2017-01-28 13:59:09.911655420 +0100
4181 unsigned long uninitialized_var(flags);
4183 if ((io_32bit & 2) && !mmio) {
4184 - local_irq_save(flags);
4185 + local_irq_save_nort(flags);
4186 ata_vlb_sync(io_ports->nsect_addr);
4190 insl(data_addr, buf, words);
4192 if ((io_32bit & 2) && !mmio)
4193 - local_irq_restore(flags);
4194 + local_irq_restore_nort(flags);
4196 if (((len + 1) & 3) < 2)
4199 unsigned long uninitialized_var(flags);
4201 if ((io_32bit & 2) && !mmio) {
4202 - local_irq_save(flags);
4203 + local_irq_save_nort(flags);
4204 ata_vlb_sync(io_ports->nsect_addr);
4208 outsl(data_addr, buf, words);
4210 if ((io_32bit & 2) && !mmio)
4211 - local_irq_restore(flags);
4212 + local_irq_restore_nort(flags);
4214 if (((len + 1) & 3) < 2)
4216 diff -Nur linux-4.9.6.orig/drivers/ide/ide-probe.c linux-4.9.6/drivers/ide/ide-probe.c
4217 --- linux-4.9.6.orig/drivers/ide/ide-probe.c 2017-01-26 08:25:24.000000000 +0100
4218 +++ linux-4.9.6/drivers/ide/ide-probe.c 2017-01-28 13:59:09.911655420 +0100
4219 @@ -196,10 +196,10 @@
4222 /* local CPU only; some systems need this */
4223 - local_irq_save(flags);
4224 + local_irq_save_nort(flags);
4225 /* read 512 bytes of id info */
4226 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4227 - local_irq_restore(flags);
4228 + local_irq_restore_nort(flags);
4230 drive->dev_flags |= IDE_DFLAG_ID_READ;
4232 diff -Nur linux-4.9.6.orig/drivers/ide/ide-taskfile.c linux-4.9.6/drivers/ide/ide-taskfile.c
4233 --- linux-4.9.6.orig/drivers/ide/ide-taskfile.c 2017-01-26 08:25:24.000000000 +0100
4234 +++ linux-4.9.6/drivers/ide/ide-taskfile.c 2017-01-28 13:59:09.911655420 +0100
4237 page_is_high = PageHighMem(page);
4239 - local_irq_save(flags);
4240 + local_irq_save_nort(flags);
4242 buf = kmap_atomic(page) + offset;
4248 - local_irq_restore(flags);
4249 + local_irq_restore_nort(flags);
4256 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4257 - local_irq_disable();
4258 + local_irq_disable_nort();
4260 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
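Every IDE hunk above applies one transformation: local_irq_save()/local_irq_restore() around slow polled port I/O becomes the _nort ("no RT") variant. These sections only need interrupts off on a stock kernel; on RT they run in threaded context where multi-millisecond IRQ-off windows would wreck latencies, so the RT build keeps interrupts enabled and merely preserves the flags plumbing. A sketch of the helpers, assuming the interrupt.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define local_irq_disable_nort()       do { } while (0)
    # define local_irq_enable_nort()        do { } while (0)
    # define local_irq_save_nort(flags)     local_save_flags(flags)
    # define local_irq_restore_nort(flags)  (void)(flags)
    #else
    # define local_irq_disable_nort()       local_irq_disable()
    # define local_irq_enable_nort()        local_irq_enable()
    # define local_irq_save_nort(flags)     local_irq_save(flags)
    # define local_irq_restore_nort(flags)  local_irq_restore(flags)
    #endif

The ipoib, gameport, 3c59x, libsas, qla2xxx, USB HCD and ntfs hunks below are the same substitution in other drivers and filesystems.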
4262 diff -Nur linux-4.9.6.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-4.9.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4263 --- linux-4.9.6.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2017-01-26 08:25:24.000000000 +0100
4264 +++ linux-4.9.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2017-01-28 13:59:09.911655420 +0100
4267 ipoib_dbg_mcast(priv, "restarting multicast task\n");
4269 - local_irq_save(flags);
4270 + local_irq_save_nort(flags);
4271 netif_addr_lock(dev);
4272 spin_lock(&priv->lock);
4276 spin_unlock(&priv->lock);
4277 netif_addr_unlock(dev);
4278 - local_irq_restore(flags);
4279 + local_irq_restore_nort(flags);
4282 * make sure the in-flight joins have finished before we attempt
4283 diff -Nur linux-4.9.6.orig/drivers/input/gameport/gameport.c linux-4.9.6/drivers/input/gameport/gameport.c
4284 --- linux-4.9.6.orig/drivers/input/gameport/gameport.c 2017-01-26 08:25:24.000000000 +0100
4285 +++ linux-4.9.6/drivers/input/gameport/gameport.c 2017-01-28 13:59:09.915655573 +0100
4289 for (i = 0; i < 50; i++) {
4290 - local_irq_save(flags);
4291 + local_irq_save_nort(flags);
4292 t1 = ktime_get_ns();
4293 for (t = 0; t < 50; t++)
4294 gameport_read(gameport);
4295 t2 = ktime_get_ns();
4296 t3 = ktime_get_ns();
4297 - local_irq_restore(flags);
4298 + local_irq_restore_nort(flags);
4300 t = (t2 - t1) - (t3 - t2);
4302 @@ -124,12 +124,12 @@
4305 for(i = 0; i < 50; i++) {
4306 - local_irq_save(flags);
4307 + local_irq_save_nort(flags);
4309 for (t = 0; t < 50; t++) gameport_read(gameport);
4312 - local_irq_restore(flags);
4313 + local_irq_restore_nort(flags);
4315 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4317 @@ -148,11 +148,11 @@
4320 for(i = 0; i < 50; i++) {
4321 - local_irq_save(flags);
4322 + local_irq_save_nort(flags);
4324 for (t = 0; t < 50; t++) gameport_read(gameport);
4326 - local_irq_restore(flags);
4327 + local_irq_restore_nort(flags);
4329 if (t2 - t1 < tx) tx = t2 - t1;
4331 diff -Nur linux-4.9.6.orig/drivers/iommu/amd_iommu.c linux-4.9.6/drivers/iommu/amd_iommu.c
4332 --- linux-4.9.6.orig/drivers/iommu/amd_iommu.c 2017-01-26 08:25:24.000000000 +0100
4333 +++ linux-4.9.6/drivers/iommu/amd_iommu.c 2017-01-28 13:59:09.915655573 +0100
4334 @@ -1923,10 +1923,10 @@
4338 - * Must be called with IRQs disabled. Warn here to detect early
4340 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4341 + * detect early when it's not.
4343 - WARN_ON(!irqs_disabled());
4344 + WARN_ON_NONRT(!irqs_disabled());
4347 spin_lock(&domain->lock);
4348 @@ -2094,10 +2094,10 @@
4349 struct protection_domain *domain;
4352 - * Must be called with IRQs disabled. Warn here to detect early
4354 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4355 + * detect early when it's not.
4357 - WARN_ON(!irqs_disabled());
4358 + WARN_ON_NONRT(!irqs_disabled());
4360 if (WARN_ON(!dev_data->domain))
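WARN_ON_NONRT() exists because the asserted invariant is simply no longer true on RT: spin_lock_irqsave() on a sleeping spinlock leaves hardware interrupts enabled, so WARN_ON(!irqs_disabled()) would fire on every call. The check is still valuable on mainline, hence a conditional variant rather than outright deletion. Sketch, assuming the assertion-macro additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_BASE
    # define WARN_ON_NONRT(condition)  do { } while (0)
    # define BUG_ON_NONRT(condition)   do { } while (0)
    #else
    # define WARN_ON_NONRT(condition)  WARN_ON(condition)
    # define BUG_ON_NONRT(condition)   BUG_ON(condition)
    #endif

The dm-rq.c hunk below relies on BUG_ON_NONRT() in exactly the same way.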
4362 diff -Nur linux-4.9.6.orig/drivers/iommu/intel-iommu.c linux-4.9.6/drivers/iommu/intel-iommu.c
4363 --- linux-4.9.6.orig/drivers/iommu/intel-iommu.c 2017-01-26 08:25:24.000000000 +0100
4364 +++ linux-4.9.6/drivers/iommu/intel-iommu.c 2017-01-28 13:59:09.915655573 +0100
4366 struct deferred_flush_table *tables;
4369 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4370 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4372 /* bitmap for indexing intel_iommus */
4373 static int g_num_of_iommus;
4374 @@ -3715,10 +3715,8 @@
4375 struct intel_iommu *iommu;
4376 struct deferred_flush_entry *entry;
4377 struct deferred_flush_data *flush_data;
4378 - unsigned int cpuid;
4380 - cpuid = get_cpu();
4381 - flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4382 + flush_data = raw_cpu_ptr(&deferred_flush);
4384 /* Flush all CPUs' entries to avoid deferring too much. If
4385 * this becomes a bottleneck, can just flush us, and rely on
4386 @@ -3751,8 +3749,6 @@
4389 spin_unlock_irqrestore(&flush_data->lock, flags);
4394 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4395 diff -Nur linux-4.9.6.orig/drivers/iommu/iova.c linux-4.9.6/drivers/iommu/iova.c
4396 --- linux-4.9.6.orig/drivers/iommu/iova.c 2017-01-26 08:25:24.000000000 +0100
4397 +++ linux-4.9.6/drivers/iommu/iova.c 2017-01-28 13:59:09.915655573 +0100
4399 #include <linux/slab.h>
4400 #include <linux/smp.h>
4401 #include <linux/bitops.h>
4402 +#include <linux/cpu.h>
4404 static bool iova_rcache_insert(struct iova_domain *iovad,
4406 @@ -420,10 +421,8 @@
4408 /* Try replenishing IOVAs by flushing rcache. */
4409 flushed_rcache = true;
4410 - preempt_disable();
4411 for_each_online_cpu(cpu)
4412 free_cpu_cached_iovas(cpu, iovad);
4418 bool can_insert = false;
4419 unsigned long flags;
4421 - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4422 + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4423 spin_lock_irqsave(&cpu_rcache->lock, flags);
4425 if (!iova_magazine_full(cpu_rcache->loaded)) {
4427 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4429 spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4430 - put_cpu_ptr(rcache->cpu_rcaches);
4433 iova_magazine_free_pfns(mag_to_free, iovad);
4435 bool has_pfn = false;
4436 unsigned long flags;
4438 - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4439 + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4440 spin_lock_irqsave(&cpu_rcache->lock, flags);
4442 if (!iova_magazine_empty(cpu_rcache->loaded)) {
4444 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4446 spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4447 - put_cpu_ptr(rcache->cpu_rcaches);
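The iova.c conversion illustrates a recurring argument in this series: get_cpu_ptr() implies preempt_disable(), but here every access to the per-CPU rcache already happens under cpu_rcache->lock. Exclusion therefore comes from the lock, not from staying on one CPU, so raw_cpu_ptr() without pinning suffices; if the task migrates after picking a pointer it merely operates on another CPU's magazine, which the lock still serializes correctly. The shape of the pattern, as a standalone sketch with hypothetical names:

    struct cpu_cache { spinlock_t lock; /* ... per-CPU payload ... */ };

    static void cpu_cache_op(struct cpu_cache __percpu *pcp)
    {
        /* no preempt_disable(): the lock provides all the exclusion */
        struct cpu_cache *c = raw_cpu_ptr(pcp);
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);
        /* touch c->...; correct even if we migrated after raw_cpu_ptr() */
        spin_unlock_irqrestore(&c->lock, flags);
    }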
4451 diff -Nur linux-4.9.6.orig/drivers/leds/trigger/Kconfig linux-4.9.6/drivers/leds/trigger/Kconfig
4452 --- linux-4.9.6.orig/drivers/leds/trigger/Kconfig 2017-01-26 08:25:24.000000000 +0100
4453 +++ linux-4.9.6/drivers/leds/trigger/Kconfig 2017-01-28 13:59:09.915655573 +0100
4456 config LEDS_TRIGGER_CPU
4457 bool "LED CPU Trigger"
4458 - depends on LEDS_TRIGGERS
4459 + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4461 This allows LEDs to be controlled by active CPUs. This shows
4462 the active CPUs across an array of LEDs so you can see which
4463 diff -Nur linux-4.9.6.orig/drivers/md/bcache/Kconfig linux-4.9.6/drivers/md/bcache/Kconfig
4464 --- linux-4.9.6.orig/drivers/md/bcache/Kconfig 2017-01-26 08:25:24.000000000 +0100
4465 +++ linux-4.9.6/drivers/md/bcache/Kconfig 2017-01-28 13:59:09.915655573 +0100
4469 tristate "Block device as cache"
4470 + depends on !PREEMPT_RT_FULL
4472 Allows a block device to be used as cache for other devices; uses
4473 a btree for indexing and the layout is optimized for SSDs.
4474 diff -Nur linux-4.9.6.orig/drivers/md/dm-rq.c linux-4.9.6/drivers/md/dm-rq.c
4475 --- linux-4.9.6.orig/drivers/md/dm-rq.c 2017-01-26 08:25:24.000000000 +0100
4476 +++ linux-4.9.6/drivers/md/dm-rq.c 2017-01-28 13:59:09.915655573 +0100
4478 /* Establish tio->ti before queuing work (map_tio_request) */
4480 kthread_queue_work(&md->kworker, &tio->work);
4481 - BUG_ON(!irqs_disabled());
4482 + BUG_ON_NONRT(!irqs_disabled());
4486 diff -Nur linux-4.9.6.orig/drivers/md/raid5.c linux-4.9.6/drivers/md/raid5.c
4487 --- linux-4.9.6.orig/drivers/md/raid5.c 2017-01-26 08:25:24.000000000 +0100
4488 +++ linux-4.9.6/drivers/md/raid5.c 2017-01-28 13:59:09.915655573 +0100
4489 @@ -1928,8 +1928,9 @@
4490 struct raid5_percpu *percpu;
4494 + cpu = get_cpu_light();
4495 percpu = per_cpu_ptr(conf->percpu, cpu);
4496 + spin_lock(&percpu->lock);
4497 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4498 ops_run_biofill(sh);
4500 @@ -1985,7 +1986,8 @@
4501 if (test_and_clear_bit(R5_Overlap, &dev->flags))
4502 wake_up(&sh->raid_conf->wait_for_overlap);
4505 + spin_unlock(&percpu->lock);
4509 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4510 @@ -6391,6 +6393,7 @@
4514 + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4518 @@ -6401,7 +6404,6 @@
4519 conf->percpu = alloc_percpu(struct raid5_percpu);
4523 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4525 conf->scribble_disks = max(conf->raid_disks,
4526 diff -Nur linux-4.9.6.orig/drivers/md/raid5.h linux-4.9.6/drivers/md/raid5.h
4527 --- linux-4.9.6.orig/drivers/md/raid5.h 2017-01-26 08:25:24.000000000 +0100
4528 +++ linux-4.9.6/drivers/md/raid5.h 2017-01-28 13:59:09.915655573 +0100
4530 int recovery_disabled;
4531 /* per cpu variables */
4532 struct raid5_percpu {
4533 + spinlock_t lock; /* Protection for -RT */
4534 struct page *spare_page; /* Used when checking P/Q in raid6 */
4535 struct flex_array *scribble; /* space for constructing buffer
4536 * lists and performing address
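The raid5 change pairs two pieces: get_cpu() becomes get_cpu_light(), and raid5_percpu grows a spinlock held across raid_run_ops(). get_cpu_light() is the RT replacement for get_cpu() wherever the pinned section may sleep or take sleeping locks: it keeps the task on its CPU via migrate_disable() but leaves preemption enabled, and the new per-CPU lock then keeps concurrent users off the shared scribble buffers. Sketch, assuming the smp.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()  migrate_enable()
    #else
    # define get_cpu_light()  get_cpu()
    # define put_cpu_light()  put_cpu()
    #endif

The fcoe, fcoe_ctlr, libfc and eventpoll hunks below use the same helper where the per-CPU data is either lock-protected or tolerant of a rare cross-CPU access.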
4537 diff -Nur linux-4.9.6.orig/drivers/misc/Kconfig linux-4.9.6/drivers/misc/Kconfig
4538 --- linux-4.9.6.orig/drivers/misc/Kconfig 2017-01-26 08:25:24.000000000 +0100
4539 +++ linux-4.9.6/drivers/misc/Kconfig 2017-01-28 13:59:09.915655573 +0100
4542 bool "Atmel AT32/AT91 Timer/Counter Library"
4543 depends on (AVR32 || ARCH_AT91)
4544 + default y if PREEMPT_RT_FULL
4546 Select this if you want a library to allocate the Timer/Counter
4547 blocks found on many Atmel processors. This facilitates using
4549 are combined to make a single 32-bit timer.
4551 When GENERIC_CLOCKEVENTS is defined, the third timer channel
4552 - may be used as a clock event device supporting oneshot mode
4553 - (delays of up to two seconds) based on the 32 KiHz clock.
4554 + may be used as a clock event device supporting oneshot mode.
4556 config ATMEL_TCB_CLKSRC_BLOCK
4559 TC can be used for other purposes, such as PWM generation and
4562 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4563 + bool "TC Block use 32 KiHz clock"
4564 + depends on ATMEL_TCB_CLKSRC
4565 + default y if !PREEMPT_RT_FULL
4567 + Select this to use 32 KiHz base clock rate as TC block clock
4568 + source for clock events.
4572 tristate "Dummy IRQ handler"
4574 diff -Nur linux-4.9.6.orig/drivers/mmc/host/mmci.c linux-4.9.6/drivers/mmc/host/mmci.c
4575 --- linux-4.9.6.orig/drivers/mmc/host/mmci.c 2017-01-26 08:25:24.000000000 +0100
4576 +++ linux-4.9.6/drivers/mmc/host/mmci.c 2017-01-28 13:59:09.915655573 +0100
4577 @@ -1147,15 +1147,12 @@
4578 struct sg_mapping_iter *sg_miter = &host->sg_miter;
4579 struct variant_data *variant = host->variant;
4580 void __iomem *base = host->base;
4581 - unsigned long flags;
4584 status = readl(base + MMCISTATUS);
4586 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4588 - local_irq_save(flags);
4591 unsigned int remain, len;
4593 @@ -1195,8 +1192,6 @@
4595 sg_miter_stop(sg_miter);
4597 - local_irq_restore(flags);
4600 * If we have less than the fifo 'half-full' threshold to transfer,
4601 * trigger a PIO interrupt as soon as any data is available.
4602 diff -Nur linux-4.9.6.orig/drivers/net/ethernet/3com/3c59x.c linux-4.9.6/drivers/net/ethernet/3com/3c59x.c
4603 --- linux-4.9.6.orig/drivers/net/ethernet/3com/3c59x.c 2017-01-26 08:25:24.000000000 +0100
4604 +++ linux-4.9.6/drivers/net/ethernet/3com/3c59x.c 2017-01-28 13:59:09.919655726 +0100
4607 struct vortex_private *vp = netdev_priv(dev);
4608 unsigned long flags;
4609 - local_irq_save(flags);
4610 + local_irq_save_nort(flags);
4611 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4612 - local_irq_restore(flags);
4613 + local_irq_restore_nort(flags);
4617 @@ -1910,12 +1910,12 @@
4618 * Block interrupts because vortex_interrupt does a bare spin_lock()
4620 unsigned long flags;
4621 - local_irq_save(flags);
4622 + local_irq_save_nort(flags);
4623 if (vp->full_bus_master_tx)
4624 boomerang_interrupt(dev->irq, dev);
4626 vortex_interrupt(dev->irq, dev);
4627 - local_irq_restore(flags);
4628 + local_irq_restore_nort(flags);
4632 diff -Nur linux-4.9.6.orig/drivers/net/ethernet/realtek/8139too.c linux-4.9.6/drivers/net/ethernet/realtek/8139too.c
4633 --- linux-4.9.6.orig/drivers/net/ethernet/realtek/8139too.c 2017-01-26 08:25:24.000000000 +0100
4634 +++ linux-4.9.6/drivers/net/ethernet/realtek/8139too.c 2017-01-28 13:59:09.919655726 +0100
4635 @@ -2233,7 +2233,7 @@
4636 struct rtl8139_private *tp = netdev_priv(dev);
4637 const int irq = tp->pci_dev->irq;
4640 + disable_irq_nosync(irq);
4641 rtl8139_interrupt(irq, dev);
4644 diff -Nur linux-4.9.6.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-4.9.6/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4645 --- linux-4.9.6.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-01-26 08:25:24.000000000 +0100
4646 +++ linux-4.9.6/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-01-28 13:59:09.919655726 +0100
4648 while (!ctx->done.done && msecs--)
4651 - wait_event_interruptible(ctx->done.wait,
4652 + swait_event_interruptible(ctx->done.wait,
4656 diff -Nur linux-4.9.6.orig/drivers/pci/access.c linux-4.9.6/drivers/pci/access.c
4657 --- linux-4.9.6.orig/drivers/pci/access.c 2017-01-26 08:25:24.000000000 +0100
4658 +++ linux-4.9.6/drivers/pci/access.c 2017-01-28 13:59:09.919655726 +0100
4660 WARN_ON(!dev->block_cfg_access);
4662 dev->block_cfg_access = 0;
4663 - wake_up_all(&pci_cfg_wait);
4664 + wake_up_all_locked(&pci_cfg_wait);
4665 raw_spin_unlock_irqrestore(&pci_lock, flags);
4667 EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4668 diff -Nur linux-4.9.6.orig/drivers/scsi/fcoe/fcoe.c linux-4.9.6/drivers/scsi/fcoe/fcoe.c
4669 --- linux-4.9.6.orig/drivers/scsi/fcoe/fcoe.c 2017-01-26 08:25:24.000000000 +0100
4670 +++ linux-4.9.6/drivers/scsi/fcoe/fcoe.c 2017-01-28 13:59:09.919655726 +0100
4671 @@ -1455,11 +1455,11 @@
4672 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
4674 struct fcoe_percpu_s *fps;
4676 + int rc, cpu = get_cpu_light();
4678 - fps = &get_cpu_var(fcoe_percpu);
4679 + fps = &per_cpu(fcoe_percpu, cpu);
4680 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
4681 - put_cpu_var(fcoe_percpu);
4686 @@ -1646,11 +1646,11 @@
4690 - stats = per_cpu_ptr(lport->stats, get_cpu());
4691 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
4692 stats->InvalidCRCCount++;
4693 if (stats->InvalidCRCCount < 5)
4694 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
4700 @@ -1693,7 +1693,7 @@
4702 hp = (struct fcoe_hdr *) skb_network_header(skb);
4704 - stats = per_cpu_ptr(lport->stats, get_cpu());
4705 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
4706 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
4707 if (stats->ErrorFrames < 5)
4708 printk(KERN_WARNING "fcoe: FCoE version "
4709 @@ -1725,13 +1725,13 @@
4712 if (!fcoe_filter_frames(lport, fp)) {
4715 fc_exch_recv(lport, fp);
4719 stats->ErrorFrames++;
4725 diff -Nur linux-4.9.6.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-4.9.6/drivers/scsi/fcoe/fcoe_ctlr.c
4726 --- linux-4.9.6.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2017-01-26 08:25:24.000000000 +0100
4727 +++ linux-4.9.6/drivers/scsi/fcoe/fcoe_ctlr.c 2017-01-28 13:59:09.919655726 +0100
4730 INIT_LIST_HEAD(&del_list);
4732 - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
4733 + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
4735 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
4736 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
4738 sel_time = fcf->time;
4744 list_for_each_entry_safe(fcf, next, &del_list, list) {
4745 /* Removes fcf from current list */
4746 diff -Nur linux-4.9.6.orig/drivers/scsi/libfc/fc_exch.c linux-4.9.6/drivers/scsi/libfc/fc_exch.c
4747 --- linux-4.9.6.orig/drivers/scsi/libfc/fc_exch.c 2017-01-26 08:25:24.000000000 +0100
4748 +++ linux-4.9.6/drivers/scsi/libfc/fc_exch.c 2017-01-28 13:59:09.919655726 +0100
4749 @@ -814,10 +814,10 @@
4751 memset(ep, 0, sizeof(*ep));
4754 + cpu = get_cpu_light();
4755 pool = per_cpu_ptr(mp->pool, cpu);
4756 spin_lock_bh(&pool->lock);
4760 /* peek cache of free slot */
4761 if (pool->left != FC_XID_UNKNOWN) {
4762 diff -Nur linux-4.9.6.orig/drivers/scsi/libsas/sas_ata.c linux-4.9.6/drivers/scsi/libsas/sas_ata.c
4763 --- linux-4.9.6.orig/drivers/scsi/libsas/sas_ata.c 2017-01-26 08:25:24.000000000 +0100
4764 +++ linux-4.9.6/drivers/scsi/libsas/sas_ata.c 2017-01-28 13:59:09.919655726 +0100
4766 /* TODO: audit callers to ensure they are ready for qc_issue to
4767 * unconditionally re-enable interrupts
4769 - local_irq_save(flags);
4770 + local_irq_save_nort(flags);
4771 spin_unlock(ap->lock);
4773 /* If the device fell off, no sense in issuing commands */
4777 spin_lock(ap->lock);
4778 - local_irq_restore(flags);
4779 + local_irq_restore_nort(flags);
4783 diff -Nur linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_inline.h linux-4.9.6/drivers/scsi/qla2xxx/qla_inline.h
4784 --- linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_inline.h 2017-01-26 08:25:24.000000000 +0100
4785 +++ linux-4.9.6/drivers/scsi/qla2xxx/qla_inline.h 2017-01-28 13:59:09.919655726 +0100
4788 unsigned long flags;
4789 struct qla_hw_data *ha = rsp->hw;
4790 - local_irq_save(flags);
4791 + local_irq_save_nort(flags);
4792 if (IS_P3P_TYPE(ha))
4793 qla82xx_poll(0, rsp);
4795 ha->isp_ops->intr_handler(0, rsp);
4796 - local_irq_restore(flags);
4797 + local_irq_restore_nort(flags);
4800 static inline uint8_t *
4801 diff -Nur linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_isr.c linux-4.9.6/drivers/scsi/qla2xxx/qla_isr.c
4802 --- linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_isr.c 2017-01-26 08:25:24.000000000 +0100
4803 +++ linux-4.9.6/drivers/scsi/qla2xxx/qla_isr.c 2017-01-28 13:59:09.919655726 +0100
4804 @@ -3125,7 +3125,11 @@
4807 kref_get(&qentry->irq_notify.kref);
4808 +#ifdef CONFIG_PREEMPT_RT_BASE
4809 + swork_queue(&qentry->irq_notify.swork);
4811 schedule_work(&qentry->irq_notify.work);
4816 diff -Nur linux-4.9.6.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-4.9.6/drivers/thermal/x86_pkg_temp_thermal.c
4817 --- linux-4.9.6.orig/drivers/thermal/x86_pkg_temp_thermal.c 2017-01-26 08:25:24.000000000 +0100
4818 +++ linux-4.9.6/drivers/thermal/x86_pkg_temp_thermal.c 2017-01-28 13:59:09.923655879 +0100
4820 #include <linux/pm.h>
4821 #include <linux/thermal.h>
4822 #include <linux/debugfs.h>
4823 +#include <linux/swork.h>
4824 #include <asm/cpu_device_id.h>
4825 #include <asm/mce.h>
4831 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4832 +static void platform_thermal_notify_work(struct swork_event *event)
4834 unsigned long flags;
4835 int cpu = smp_processor_id();
4837 pkg_work_scheduled[phy_id]) {
4838 disable_pkg_thres_interrupt();
4839 spin_unlock_irqrestore(&pkg_work_lock, flags);
4843 pkg_work_scheduled[phy_id] = 1;
4844 spin_unlock_irqrestore(&pkg_work_lock, flags);
4845 @@ -379,9 +380,48 @@
4846 schedule_delayed_work_on(cpu,
4847 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
4848 msecs_to_jiffies(notify_delay_ms));
4851 +#ifdef CONFIG_PREEMPT_RT_FULL
4852 +static struct swork_event notify_work;
4854 +static int thermal_notify_work_init(void)
4858 + err = swork_get();
4862 + INIT_SWORK(&notify_work, platform_thermal_notify_work);
4866 +static void thermal_notify_work_cleanup(void)
4871 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4873 + swork_queue(&notify_work);
4877 +#else /* !CONFIG_PREEMPT_RT_FULL */
4879 +static int thermal_notify_work_init(void) { return 0; }
4881 +static void thermal_notify_work_cleanup(void) { }
4883 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4885 + platform_thermal_notify_work(NULL);
4889 +#endif /* CONFIG_PREEMPT_RT_FULL */
4891 static int find_siblings_cpu(int cpu)
4895 if (!x86_match_cpu(pkg_temp_thermal_ids))
4898 + if (!thermal_notify_work_init())
4901 spin_lock_init(&pkg_work_lock);
4902 platform_thermal_package_notify =
4903 pkg_temp_thermal_platform_thermal_notify;
4905 kfree(pkg_work_scheduled);
4906 platform_thermal_package_notify = NULL;
4907 platform_thermal_package_rate_control = NULL;
4909 + thermal_notify_work_cleanup();
4914 mutex_unlock(&phy_dev_list_mutex);
4915 platform_thermal_package_notify = NULL;
4916 platform_thermal_package_rate_control = NULL;
4917 + thermal_notify_work_cleanup();
4918 for_each_online_cpu(i)
4919 cancel_delayed_work_sync(
4920 &per_cpu(pkg_temp_thermal_threshold_work, i));
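pkg_temp_thermal_platform_thermal_notify() is invoked from the thermal interrupt. On RT, hard IRQ context cannot take the sleeping locks inside the regular workqueue enqueue path, so the notification is bounced through the "simple work" (swork) infrastructure, whose queue operation is safe from raw IRQ context and whose callback then runs in a preemptible kernel thread. The usage pattern, condensed into a standalone sketch (API per the include/linux/swork.h added by this series):

    #include <linux/swork.h>

    static struct swork_event ev;

    static void ev_fn(struct swork_event *sev)
    {
        /* preemptible worker context; may sleep, take mutexes, ... */
    }

    static int ev_setup(void)              /* e.g. from driver init */
    {
        int err = swork_get();             /* bring up the worker thread */
        if (err)
            return err;
        INIT_SWORK(&ev, ev_fn);
        return 0;
    }

    static void ev_raise(void)             /* safe from hard IRQ context */
    {
        swork_queue(&ev);
    }

    static void ev_teardown(void)
    {
        swork_put();
    }

fs/aio.c below applies the identical conversion to free_ioctx()/free_ioctx_users(), which may fire from atomic percpu_ref teardown context.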
4921 diff -Nur linux-4.9.6.orig/drivers/tty/serial/8250/8250_core.c linux-4.9.6/drivers/tty/serial/8250/8250_core.c
4922 --- linux-4.9.6.orig/drivers/tty/serial/8250/8250_core.c 2017-01-26 08:25:24.000000000 +0100
4923 +++ linux-4.9.6/drivers/tty/serial/8250/8250_core.c 2017-01-28 13:59:09.923655879 +0100
4926 static unsigned int skip_txen_test; /* force skip of txen test at init time */
4928 -#define PASS_LIMIT 512
4930 + * On -rt we can have more delays, and legitimately
4931 + * so, so don't drop work spuriously and spam the
4934 +#ifdef CONFIG_PREEMPT_RT_FULL
4935 +# define PASS_LIMIT 1000000
4937 +# define PASS_LIMIT 512
4940 #include <asm/serial.h>
4942 diff -Nur linux-4.9.6.orig/drivers/tty/serial/8250/8250_port.c linux-4.9.6/drivers/tty/serial/8250/8250_port.c
4943 --- linux-4.9.6.orig/drivers/tty/serial/8250/8250_port.c 2017-01-26 08:25:24.000000000 +0100
4944 +++ linux-4.9.6/drivers/tty/serial/8250/8250_port.c 2017-01-28 13:59:09.923655879 +0100
4946 #include <linux/nmi.h>
4947 #include <linux/mutex.h>
4948 #include <linux/slab.h>
4949 +#include <linux/kdb.h>
4950 #include <linux/uaccess.h>
4951 #include <linux/pm_runtime.h>
4952 #include <linux/timer.h>
4953 @@ -3144,9 +3145,9 @@
4955 serial8250_rpm_get(up);
4958 + if (port->sysrq || oops_in_progress)
4960 - else if (oops_in_progress)
4961 + else if (in_kdb_printk())
4962 locked = spin_trylock_irqsave(&port->lock, flags);
4964 spin_lock_irqsave(&port->lock, flags);
4965 diff -Nur linux-4.9.6.orig/drivers/tty/serial/amba-pl011.c linux-4.9.6/drivers/tty/serial/amba-pl011.c
4966 --- linux-4.9.6.orig/drivers/tty/serial/amba-pl011.c 2017-01-26 08:25:24.000000000 +0100
4967 +++ linux-4.9.6/drivers/tty/serial/amba-pl011.c 2017-01-28 13:59:09.923655879 +0100
4968 @@ -2194,13 +2194,19 @@
4970 clk_enable(uap->clk);
4972 - local_irq_save(flags);
4974 + * local_irq_save(flags);
4976 + * This local_irq_save() is nonsense. If we come in via sysrq
4977 + * handling then interrupts are already disabled. Aside from
4978 + * that, the port.sysrq check is racy on SMP regardless.
4980 if (uap->port.sysrq)
4982 else if (oops_in_progress)
4983 - locked = spin_trylock(&uap->port.lock);
4984 + locked = spin_trylock_irqsave(&uap->port.lock, flags);
4986 - spin_lock(&uap->port.lock);
4987 + spin_lock_irqsave(&uap->port.lock, flags);
4990 * First save the CR then disable the interrupts
4991 @@ -2224,8 +2230,7 @@
4992 pl011_write(old_cr, uap, REG_CR);
4995 - spin_unlock(&uap->port.lock);
4996 - local_irq_restore(flags);
4997 + spin_unlock_irqrestore(&uap->port.lock, flags);
4999 clk_disable(uap->clk);
5001 diff -Nur linux-4.9.6.orig/drivers/tty/serial/omap-serial.c linux-4.9.6/drivers/tty/serial/omap-serial.c
5002 --- linux-4.9.6.orig/drivers/tty/serial/omap-serial.c 2017-01-26 08:25:24.000000000 +0100
5003 +++ linux-4.9.6/drivers/tty/serial/omap-serial.c 2017-01-28 13:59:09.923655879 +0100
5004 @@ -1257,13 +1257,10 @@
5006 pm_runtime_get_sync(up->dev);
5008 - local_irq_save(flags);
5009 - if (up->port.sysrq)
5011 - else if (oops_in_progress)
5012 - locked = spin_trylock(&up->port.lock);
5013 + if (up->port.sysrq || oops_in_progress)
5014 + locked = spin_trylock_irqsave(&up->port.lock, flags);
5016 - spin_lock(&up->port.lock);
5017 + spin_lock_irqsave(&up->port.lock, flags);
5020 * First save the IER then disable the interrupts
5021 @@ -1292,8 +1289,7 @@
5022 pm_runtime_mark_last_busy(up->dev);
5023 pm_runtime_put_autosuspend(up->dev);
5025 - spin_unlock(&up->port.lock);
5026 - local_irq_restore(flags);
5027 + spin_unlock_irqrestore(&up->port.lock, flags);
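The three console write paths above (8250, pl011, omap-serial) converge on one locking shape: always take the port lock with irqsave semantics so the whole operation nests correctly on RT, but only trylock when the CPU may already hold the lock (sysrq handling, oops_in_progress, kdb), because deadlocking the console during a crash dump is worse than a torn output line. The common skeleton, with a hypothetical driver name:

    static void foo_console_write(struct uart_port *port, const char *s,
                                  unsigned int count)
    {
        unsigned long flags;
        int locked = 1;

        if (port->sysrq || oops_in_progress)
            locked = spin_trylock_irqsave(&port->lock, flags);
        else
            spin_lock_irqsave(&port->lock, flags);

        /* ... mask UART interrupts, emit s[0..count), restore ... */

        if (locked)
            spin_unlock_irqrestore(&port->lock, flags);
    }

8250 goes a step further, as its hunk shows: sysrq and oops skip the lock entirely, and only kdb output uses the trylock.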
5031 diff -Nur linux-4.9.6.orig/drivers/usb/core/hcd.c linux-4.9.6/drivers/usb/core/hcd.c
5032 --- linux-4.9.6.orig/drivers/usb/core/hcd.c 2017-01-26 08:25:24.000000000 +0100
5033 +++ linux-4.9.6/drivers/usb/core/hcd.c 2017-01-28 13:59:09.923655879 +0100
5034 @@ -1761,9 +1761,9 @@
5035 * and no one may trigger the above deadlock situation when
5036 * running complete() in tasklet.
5038 - local_irq_save(flags);
5039 + local_irq_save_nort(flags);
5041 - local_irq_restore(flags);
5042 + local_irq_restore_nort(flags);
5044 usb_anchor_resume_wakeups(anchor);
5045 atomic_dec(&urb->use_count);
5046 diff -Nur linux-4.9.6.orig/drivers/usb/gadget/function/f_fs.c linux-4.9.6/drivers/usb/gadget/function/f_fs.c
5047 --- linux-4.9.6.orig/drivers/usb/gadget/function/f_fs.c 2017-01-26 08:25:24.000000000 +0100
5048 +++ linux-4.9.6/drivers/usb/gadget/function/f_fs.c 2017-01-28 13:59:09.923655879 +0100
5049 @@ -1593,7 +1593,7 @@
5050 pr_info("%s(): freeing\n", __func__);
5051 ffs_data_clear(ffs);
5052 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5053 - waitqueue_active(&ffs->ep0req_completion.wait));
5054 + swait_active(&ffs->ep0req_completion.wait));
5055 kfree(ffs->dev_name);
5058 diff -Nur linux-4.9.6.orig/drivers/usb/gadget/legacy/inode.c linux-4.9.6/drivers/usb/gadget/legacy/inode.c
5059 --- linux-4.9.6.orig/drivers/usb/gadget/legacy/inode.c 2017-01-26 08:25:24.000000000 +0100
5060 +++ linux-4.9.6/drivers/usb/gadget/legacy/inode.c 2017-01-28 13:59:09.923655879 +0100
5062 spin_unlock_irq (&epdata->dev->lock);
5064 if (likely (value == 0)) {
5065 - value = wait_event_interruptible (done.wait, done.done);
5066 + value = swait_event_interruptible (done.wait, done.done);
5068 spin_lock_irq (&epdata->dev->lock);
5069 if (likely (epdata->ep != NULL)) {
5071 usb_ep_dequeue (epdata->ep, epdata->req);
5072 spin_unlock_irq (&epdata->dev->lock);
5074 - wait_event (done.wait, done.done);
5075 + swait_event (done.wait, done.done);
5076 if (epdata->status == -ECONNRESET)
5077 epdata->status = -EINTR;
5079 diff -Nur linux-4.9.6.orig/fs/aio.c linux-4.9.6/fs/aio.c
5080 --- linux-4.9.6.orig/fs/aio.c 2017-01-26 08:25:24.000000000 +0100
5081 +++ linux-4.9.6/fs/aio.c 2017-01-28 13:59:09.923655879 +0100
5083 #include <linux/ramfs.h>
5084 #include <linux/percpu-refcount.h>
5085 #include <linux/mount.h>
5086 +#include <linux/swork.h>
5088 #include <asm/kmap_types.h>
5089 #include <asm/uaccess.h>
5091 struct page **ring_pages;
5094 - struct work_struct free_work;
5095 + struct swork_event free_work;
5098 * signals when all in-flight requests are done
5101 .kill_sb = kill_anon_super,
5103 + BUG_ON(swork_get());
5104 aio_mnt = kern_mount(&aio_fs);
5105 if (IS_ERR(aio_mnt))
5106 panic("Failed to create aio fs mount.");
5108 return cancel(&kiocb->common);
5111 -static void free_ioctx(struct work_struct *work)
5112 +static void free_ioctx(struct swork_event *sev)
5114 - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5115 + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5117 pr_debug("freeing %p\n", ctx);
5120 if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5121 complete(&ctx->rq_wait->comp);
5123 - INIT_WORK(&ctx->free_work, free_ioctx);
5124 - schedule_work(&ctx->free_work);
5125 + INIT_SWORK(&ctx->free_work, free_ioctx);
5126 + swork_queue(&ctx->free_work);
5131 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5132 * now it's safe to cancel any that need to be.
5134 -static void free_ioctx_users(struct percpu_ref *ref)
5135 +static void free_ioctx_users_work(struct swork_event *sev)
5137 - struct kioctx *ctx = container_of(ref, struct kioctx, users);
5138 + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5139 struct aio_kiocb *req;
5141 spin_lock_irq(&ctx->ctx_lock);
5142 @@ -632,6 +634,14 @@
5143 percpu_ref_put(&ctx->reqs);
5146 +static void free_ioctx_users(struct percpu_ref *ref)
5148 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
5150 + INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5151 + swork_queue(&ctx->free_work);
5154 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5157 diff -Nur linux-4.9.6.orig/fs/autofs4/autofs_i.h linux-4.9.6/fs/autofs4/autofs_i.h
5158 --- linux-4.9.6.orig/fs/autofs4/autofs_i.h 2017-01-26 08:25:24.000000000 +0100
5159 +++ linux-4.9.6/fs/autofs4/autofs_i.h 2017-01-28 13:59:09.923655879 +0100
5161 #include <linux/sched.h>
5162 #include <linux/mount.h>
5163 #include <linux/namei.h>
5164 +#include <linux/delay.h>
5165 #include <asm/current.h>
5166 #include <linux/uaccess.h>
5168 diff -Nur linux-4.9.6.orig/fs/autofs4/expire.c linux-4.9.6/fs/autofs4/expire.c
5169 --- linux-4.9.6.orig/fs/autofs4/expire.c 2017-01-26 08:25:24.000000000 +0100
5170 +++ linux-4.9.6/fs/autofs4/expire.c 2017-01-28 13:59:09.923655879 +0100
5172 parent = p->d_parent;
5173 if (!spin_trylock(&parent->d_lock)) {
5174 spin_unlock(&p->d_lock);
5179 spin_unlock(&p->d_lock);
5180 diff -Nur linux-4.9.6.orig/fs/buffer.c linux-4.9.6/fs/buffer.c
5181 --- linux-4.9.6.orig/fs/buffer.c 2017-01-26 08:25:24.000000000 +0100
5182 +++ linux-4.9.6/fs/buffer.c 2017-01-28 13:59:09.927656033 +0100
5184 * decide that the page is now completely done.
5186 first = page_buffers(page);
5187 - local_irq_save(flags);
5188 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5189 + flags = bh_uptodate_lock_irqsave(first);
5190 clear_buffer_async_read(bh);
5195 tmp = tmp->b_this_page;
5196 } while (tmp != bh);
5197 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5198 - local_irq_restore(flags);
5199 + bh_uptodate_unlock_irqrestore(first, flags);
5202 * If none of the buffers had errors and they are all
5207 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5208 - local_irq_restore(flags);
5210 + bh_uptodate_unlock_irqrestore(first, flags);
5217 first = page_buffers(page);
5218 - local_irq_save(flags);
5219 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5220 + flags = bh_uptodate_lock_irqsave(first);
5222 clear_buffer_async_write(bh);
5224 @@ -371,15 +366,12 @@
5226 tmp = tmp->b_this_page;
5228 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5229 - local_irq_restore(flags);
5230 + bh_uptodate_unlock_irqrestore(first, flags);
5231 end_page_writeback(page);
5235 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5236 - local_irq_restore(flags);
5238 + bh_uptodate_unlock_irqrestore(first, flags);
5240 EXPORT_SYMBOL(end_buffer_async_write);
5242 @@ -3383,6 +3375,7 @@
5243 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5245 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5246 + buffer_head_init_locks(ret);
5248 __this_cpu_inc(bh_accounting.nr);
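end_buffer_async_read()/end_buffer_async_write() previously serialized against each other with local_irq_save() plus bit_spin_lock(BH_Uptodate_Lock), a pure spinning lock that cannot be made preemptible. The patch folds that pair into bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore(), which on RT back onto a real spinlock embedded in struct buffer_head and set up by the buffer_head_init_locks() call added in the allocation hunk above. Sketch of the helpers, assuming the buffer_head.h changes from this series:

    static inline unsigned long
    bh_uptodate_lock_irqsave(struct buffer_head *bh)
    {
        unsigned long flags;

    #ifndef CONFIG_PREEMPT_RT_BASE
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
    #else
        spin_lock_irqsave(&bh->b_uptodate_lock, flags);
    #endif
        return flags;
    }

    static inline void
    bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
    {
    #ifndef CONFIG_PREEMPT_RT_BASE
        bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
        local_irq_restore(flags);
    #else
        spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
    #endif
    }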
5250 diff -Nur linux-4.9.6.orig/fs/cifs/readdir.c linux-4.9.6/fs/cifs/readdir.c
5251 --- linux-4.9.6.orig/fs/cifs/readdir.c 2017-01-26 08:25:24.000000000 +0100
5252 +++ linux-4.9.6/fs/cifs/readdir.c 2017-01-28 13:59:09.927656033 +0100
5254 struct inode *inode;
5255 struct super_block *sb = parent->d_sb;
5256 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5257 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5258 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5260 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5262 diff -Nur linux-4.9.6.orig/fs/dcache.c linux-4.9.6/fs/dcache.c
5263 --- linux-4.9.6.orig/fs/dcache.c 2017-01-26 08:25:24.000000000 +0100
5264 +++ linux-4.9.6/fs/dcache.c 2017-01-28 13:59:09.927656033 +0100
5266 #include <linux/mm.h>
5267 #include <linux/fs.h>
5268 #include <linux/fsnotify.h>
5269 +#include <linux/delay.h>
5270 #include <linux/slab.h>
5271 #include <linux/init.h>
5272 #include <linux/hash.h>
5275 void dput(struct dentry *dentry)
5277 + struct dentry *parent;
5279 if (unlikely(!dentry))
5282 @@ -788,9 +791,18 @@
5286 - dentry = dentry_kill(dentry);
5289 + parent = dentry_kill(dentry);
5293 + if (parent == dentry) {
5294 + /* the task with the highest priority won't schedule */
5295 + r = cond_resched();
5304 @@ -2324,7 +2336,7 @@
5305 if (dentry->d_lockref.count == 1) {
5306 if (!spin_trylock(&inode->i_lock)) {
5307 spin_unlock(&dentry->d_lock);
5312 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5313 @@ -2384,21 +2396,24 @@
5315 static void d_wait_lookup(struct dentry *dentry)
5317 - if (d_in_lookup(dentry)) {
5318 - DECLARE_WAITQUEUE(wait, current);
5319 - add_wait_queue(dentry->d_wait, &wait);
5321 - set_current_state(TASK_UNINTERRUPTIBLE);
5322 - spin_unlock(&dentry->d_lock);
5324 - spin_lock(&dentry->d_lock);
5325 - } while (d_in_lookup(dentry));
5327 + struct swait_queue __wait;
5329 + if (!d_in_lookup(dentry))
5332 + INIT_LIST_HEAD(&__wait.task_list);
5334 + prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5335 + spin_unlock(&dentry->d_lock);
5337 + spin_lock(&dentry->d_lock);
5338 + } while (d_in_lookup(dentry));
5339 + finish_swait(dentry->d_wait, &__wait);
5342 struct dentry *d_alloc_parallel(struct dentry *parent,
5343 const struct qstr *name,
5344 - wait_queue_head_t *wq)
5345 + struct swait_queue_head *wq)
5347 unsigned int hash = name->hash;
5348 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5349 @@ -2507,7 +2522,7 @@
5351 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5352 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5353 - wake_up_all(dentry->d_wait);
5354 + swake_up_all(dentry->d_wait);
5355 dentry->d_wait = NULL;
5357 INIT_HLIST_NODE(&dentry->d_u.d_alias);
5358 @@ -3604,6 +3619,11 @@
5360 void __init vfs_caches_init_early(void)
5364 + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5365 + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5367 dcache_init_early();
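The dcache changes swap the parallel-lookup waitqueue for a simple waitqueue (swait). Regular waitqueues allow custom wakeup callbacks and unbounded work under the waitqueue lock; swait queues guarantee short, raw-spinlock-safe wake-ups, which is what makes swake_up_all() legal from the contexts __d_lookup_done() runs in on RT. d_wait_lookup() above open-codes the wait loop; the same loop written with the stock swait helpers, as a standalone sketch:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(wq);
    static bool done;

    static void waiter(void)
    {
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
            prepare_to_swait(&wq, &wait, TASK_UNINTERRUPTIBLE);
            if (done)
                break;
            schedule();
        }
        finish_swait(&wq, &wait);
    }

    static void waker(void)
    {
        done = true;
        swake_up_all(&wq);
    }

The cifs, fuse, namei and nfs hunks below merely chase the resulting d_alloc_parallel() signature change from wait_queue_head_t to struct swait_queue_head.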
5370 diff -Nur linux-4.9.6.orig/fs/eventpoll.c linux-4.9.6/fs/eventpoll.c
5371 --- linux-4.9.6.orig/fs/eventpoll.c 2017-01-26 08:25:24.000000000 +0100
5372 +++ linux-4.9.6/fs/eventpoll.c 2017-01-28 13:59:09.927656033 +0100
5373 @@ -510,12 +510,12 @@
5375 static void ep_poll_safewake(wait_queue_head_t *wq)
5377 - int this_cpu = get_cpu();
5378 + int this_cpu = get_cpu_light();
5380 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5381 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5387 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5388 diff -Nur linux-4.9.6.orig/fs/exec.c linux-4.9.6/fs/exec.c
5389 --- linux-4.9.6.orig/fs/exec.c 2017-01-26 08:25:24.000000000 +0100
5390 +++ linux-4.9.6/fs/exec.c 2017-01-28 13:59:09.927656033 +0100
5391 @@ -1017,12 +1017,14 @@
5395 + preempt_disable_rt();
5396 active_mm = tsk->active_mm;
5398 tsk->active_mm = mm;
5399 activate_mm(active_mm, mm);
5400 tsk->mm->vmacache_seqnum = 0;
5401 vmacache_flush(tsk);
5402 + preempt_enable_rt();
5405 up_read(&old_mm->mmap_sem);
5406 diff -Nur linux-4.9.6.orig/fs/fuse/dir.c linux-4.9.6/fs/fuse/dir.c
5407 --- linux-4.9.6.orig/fs/fuse/dir.c 2017-01-26 08:25:24.000000000 +0100
5408 +++ linux-4.9.6/fs/fuse/dir.c 2017-01-28 13:59:09.927656033 +0100
5409 @@ -1191,7 +1191,7 @@
5410 struct inode *dir = d_inode(parent);
5411 struct fuse_conn *fc;
5412 struct inode *inode;
5413 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5414 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5418 diff -Nur linux-4.9.6.orig/fs/jbd2/checkpoint.c linux-4.9.6/fs/jbd2/checkpoint.c
5419 --- linux-4.9.6.orig/fs/jbd2/checkpoint.c 2017-01-26 08:25:24.000000000 +0100
5420 +++ linux-4.9.6/fs/jbd2/checkpoint.c 2017-01-28 13:59:09.927656033 +0100
5422 nblocks = jbd2_space_needed(journal);
5423 while (jbd2_log_space_left(journal) < nblocks) {
5424 write_unlock(&journal->j_state_lock);
5425 + if (current->plug)
5427 mutex_lock(&journal->j_checkpoint_mutex);
5430 diff -Nur linux-4.9.6.orig/fs/locks.c linux-4.9.6/fs/locks.c
5431 --- linux-4.9.6.orig/fs/locks.c 2017-01-26 08:25:24.000000000 +0100
5432 +++ linux-4.9.6/fs/locks.c 2017-01-28 13:59:09.927656033 +0100
5437 - percpu_down_read_preempt_disable(&file_rwsem);
5438 + percpu_down_read(&file_rwsem);
5439 spin_lock(&ctx->flc_lock);
5440 if (request->fl_flags & FL_ACCESS)
5445 spin_unlock(&ctx->flc_lock);
5446 - percpu_up_read_preempt_enable(&file_rwsem);
5447 + percpu_up_read(&file_rwsem);
5449 locks_free_lock(new_fl);
5450 locks_dispose_list(&dispose);
5451 @@ -1013,7 +1013,7 @@
5452 new_fl2 = locks_alloc_lock();
5455 - percpu_down_read_preempt_disable(&file_rwsem);
5456 + percpu_down_read(&file_rwsem);
5457 spin_lock(&ctx->flc_lock);
5459 * New lock request. Walk all POSIX locks and look for conflicts. If
5460 @@ -1185,7 +1185,7 @@
5463 spin_unlock(&ctx->flc_lock);
5464 - percpu_up_read_preempt_enable(&file_rwsem);
5465 + percpu_up_read(&file_rwsem);
5467 * Free any unused locks.
5469 @@ -1460,7 +1460,7 @@
5473 - percpu_down_read_preempt_disable(&file_rwsem);
5474 + percpu_down_read(&file_rwsem);
5475 spin_lock(&ctx->flc_lock);
5477 time_out_leases(inode, &dispose);
5478 @@ -1512,13 +1512,13 @@
5479 locks_insert_block(fl, new_fl);
5480 trace_break_lease_block(inode, new_fl);
5481 spin_unlock(&ctx->flc_lock);
5482 - percpu_up_read_preempt_enable(&file_rwsem);
5483 + percpu_up_read(&file_rwsem);
5485 locks_dispose_list(&dispose);
5486 error = wait_event_interruptible_timeout(new_fl->fl_wait,
5487 !new_fl->fl_next, break_time);
5489 - percpu_down_read_preempt_disable(&file_rwsem);
5490 + percpu_down_read(&file_rwsem);
5491 spin_lock(&ctx->flc_lock);
5492 trace_break_lease_unblock(inode, new_fl);
5493 locks_delete_block(new_fl);
5494 @@ -1535,7 +1535,7 @@
5497 spin_unlock(&ctx->flc_lock);
5498 - percpu_up_read_preempt_enable(&file_rwsem);
5499 + percpu_up_read(&file_rwsem);
5500 locks_dispose_list(&dispose);
5501 locks_free_lock(new_fl);
5503 @@ -1609,7 +1609,7 @@
5505 ctx = smp_load_acquire(&inode->i_flctx);
5506 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5507 - percpu_down_read_preempt_disable(&file_rwsem);
5508 + percpu_down_read(&file_rwsem);
5509 spin_lock(&ctx->flc_lock);
5510 time_out_leases(inode, &dispose);
5511 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5512 @@ -1619,7 +1619,7 @@
5515 spin_unlock(&ctx->flc_lock);
5516 - percpu_up_read_preempt_enable(&file_rwsem);
5517 + percpu_up_read(&file_rwsem);
5519 locks_dispose_list(&dispose);
5521 @@ -1694,7 +1694,7 @@
5525 - percpu_down_read_preempt_disable(&file_rwsem);
5526 + percpu_down_read(&file_rwsem);
5527 spin_lock(&ctx->flc_lock);
5528 time_out_leases(inode, &dispose);
5529 error = check_conflicting_open(dentry, arg, lease->fl_flags);
5530 @@ -1765,7 +1765,7 @@
5531 lease->fl_lmops->lm_setup(lease, priv);
5533 spin_unlock(&ctx->flc_lock);
5534 - percpu_up_read_preempt_enable(&file_rwsem);
5535 + percpu_up_read(&file_rwsem);
5536 locks_dispose_list(&dispose);
5538 inode_unlock(inode);
5539 @@ -1788,7 +1788,7 @@
5543 - percpu_down_read_preempt_disable(&file_rwsem);
5544 + percpu_down_read(&file_rwsem);
5545 spin_lock(&ctx->flc_lock);
5546 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5547 if (fl->fl_file == filp &&
5548 @@ -1801,7 +1801,7 @@
5550 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5551 spin_unlock(&ctx->flc_lock);
5552 - percpu_up_read_preempt_enable(&file_rwsem);
5553 + percpu_up_read(&file_rwsem);
5554 locks_dispose_list(&dispose);
5557 @@ -2532,13 +2532,13 @@
5558 if (list_empty(&ctx->flc_lease))
5561 - percpu_down_read_preempt_disable(&file_rwsem);
5562 + percpu_down_read(&file_rwsem);
5563 spin_lock(&ctx->flc_lock);
5564 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5565 if (filp == fl->fl_file)
5566 lease_modify(fl, F_UNLCK, &dispose);
5567 spin_unlock(&ctx->flc_lock);
5568 - percpu_up_read_preempt_enable(&file_rwsem);
5569 + percpu_up_read(&file_rwsem);
5571 locks_dispose_list(&dispose);
5573 diff -Nur linux-4.9.6.orig/fs/namei.c linux-4.9.6/fs/namei.c
5574 --- linux-4.9.6.orig/fs/namei.c 2017-01-26 08:25:24.000000000 +0100
5575 +++ linux-4.9.6/fs/namei.c 2017-01-28 13:59:09.927656033 +0100
5576 @@ -1629,7 +1629,7 @@
5578 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5579 struct inode *inode = dir->d_inode;
5580 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5581 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5583 inode_lock_shared(inode);
5584 /* Don't go there if it's already dead */
5585 @@ -3086,7 +3086,7 @@
5586 struct dentry *dentry;
5587 int error, create_error = 0;
5588 umode_t mode = op->mode;
5589 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5590 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5592 if (unlikely(IS_DEADDIR(dir_inode)))
5594 diff -Nur linux-4.9.6.orig/fs/namespace.c linux-4.9.6/fs/namespace.c
5595 --- linux-4.9.6.orig/fs/namespace.c 2017-01-26 08:25:24.000000000 +0100
5596 +++ linux-4.9.6/fs/namespace.c 2017-01-28 13:59:09.931656187 +0100
5598 #include <linux/mnt_namespace.h>
5599 #include <linux/user_namespace.h>
5600 #include <linux/namei.h>
5601 +#include <linux/delay.h>
5602 #include <linux/security.h>
5603 #include <linux/idr.h>
5604 #include <linux/init.h> /* init_rootfs */
5605 @@ -356,8 +357,11 @@
5606 * incremented count after it has set MNT_WRITE_HOLD.
5609 - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5611 + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5614 + preempt_disable();
5617 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
5618 * be set to match its requirements. So we must not load that until
5619 diff -Nur linux-4.9.6.orig/fs/nfs/delegation.c linux-4.9.6/fs/nfs/delegation.c
5620 --- linux-4.9.6.orig/fs/nfs/delegation.c 2017-01-26 08:25:24.000000000 +0100
5621 +++ linux-4.9.6/fs/nfs/delegation.c 2017-01-28 13:59:09.931656187 +0100
5622 @@ -150,11 +150,11 @@
5624 /* Block nfs4_proc_unlck */
5625 mutex_lock(&sp->so_delegreturn_mutex);
5626 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5627 + seq = read_seqbegin(&sp->so_reclaim_seqlock);
5628 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
5630 err = nfs_delegation_claim_locks(ctx, state, stateid);
5631 - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5632 + if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
5634 mutex_unlock(&sp->so_delegreturn_mutex);
5635 put_nfs_open_context(ctx);
5636 diff -Nur linux-4.9.6.orig/fs/nfs/dir.c linux-4.9.6/fs/nfs/dir.c
5637 --- linux-4.9.6.orig/fs/nfs/dir.c 2017-01-26 08:25:24.000000000 +0100
5638 +++ linux-4.9.6/fs/nfs/dir.c 2017-01-28 13:59:09.931656187 +0100
5640 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
5642 struct qstr filename = QSTR_INIT(entry->name, entry->len);
5643 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5644 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5645 struct dentry *dentry;
5646 struct dentry *alias;
5647 struct inode *dir = d_inode(parent);
5648 @@ -1487,7 +1487,7 @@
5649 struct file *file, unsigned open_flags,
5650 umode_t mode, int *opened)
5652 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5653 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5654 struct nfs_open_context *ctx;
5656 struct iattr attr = { .ia_valid = ATTR_OPEN };
5657 @@ -1802,7 +1802,11 @@
5659 trace_nfs_rmdir_enter(dir, dentry);
5660 if (d_really_is_positive(dentry)) {
5661 +#ifdef CONFIG_PREEMPT_RT_BASE
5662 + down(&NFS_I(d_inode(dentry))->rmdir_sem);
5664 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5666 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5667 /* Ensure the VFS deletes this inode */
5669 @@ -1812,7 +1816,11 @@
5671 nfs_dentry_handle_enoent(dentry);
5673 +#ifdef CONFIG_PREEMPT_RT_BASE
5674 + up(&NFS_I(d_inode(dentry))->rmdir_sem);
5676 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5679 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5680 trace_nfs_rmdir_exit(dir, dentry, error);
5681 diff -Nur linux-4.9.6.orig/fs/nfs/inode.c linux-4.9.6/fs/nfs/inode.c
5682 --- linux-4.9.6.orig/fs/nfs/inode.c 2017-01-26 08:25:24.000000000 +0100
5683 +++ linux-4.9.6/fs/nfs/inode.c 2017-01-28 13:59:09.931656187 +0100
5684 @@ -1957,7 +1957,11 @@
5685 nfsi->nrequests = 0;
5686 nfsi->commit_info.ncommit = 0;
5687 atomic_set(&nfsi->commit_info.rpcs_out, 0);
5688 +#ifdef CONFIG_PREEMPT_RT_BASE
5689 + sema_init(&nfsi->rmdir_sem, 1);
5691 init_rwsem(&nfsi->rmdir_sem);
5693 nfs4_init_once(nfsi);
5696 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4_fs.h linux-4.9.6/fs/nfs/nfs4_fs.h
5697 --- linux-4.9.6.orig/fs/nfs/nfs4_fs.h 2017-01-26 08:25:24.000000000 +0100
5698 +++ linux-4.9.6/fs/nfs/nfs4_fs.h 2017-01-28 13:59:09.931656187 +0100
5700 unsigned long so_flags;
5701 struct list_head so_states;
5702 struct nfs_seqid_counter so_seqid;
5703 - seqcount_t so_reclaim_seqcount;
5704 + seqlock_t so_reclaim_seqlock;
5705 struct mutex so_delegreturn_mutex;
5708 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4proc.c linux-4.9.6/fs/nfs/nfs4proc.c
5709 --- linux-4.9.6.orig/fs/nfs/nfs4proc.c 2017-01-26 08:25:24.000000000 +0100
5710 +++ linux-4.9.6/fs/nfs/nfs4proc.c 2017-01-28 13:59:09.931656187 +0100
5711 @@ -2697,7 +2697,7 @@
5715 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5716 + seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5718 ret = _nfs4_proc_open(opendata);
5720 @@ -2735,7 +2735,7 @@
5722 if (d_inode(dentry) == state->inode) {
5723 nfs_inode_attach_open_context(ctx);
5724 - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5725 + if (read_seqretry(&sp->so_reclaim_seqlock, seq))
5726 nfs4_schedule_stateid_recovery(server, state);
5729 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4state.c linux-4.9.6/fs/nfs/nfs4state.c
5730 --- linux-4.9.6.orig/fs/nfs/nfs4state.c 2017-01-26 08:25:24.000000000 +0100
5731 +++ linux-4.9.6/fs/nfs/nfs4state.c 2017-01-28 13:59:09.931656187 +0100
5733 nfs4_init_seqid_counter(&sp->so_seqid);
5734 atomic_set(&sp->so_count, 1);
5735 INIT_LIST_HEAD(&sp->so_lru);
5736 - seqcount_init(&sp->so_reclaim_seqcount);
5737 + seqlock_init(&sp->so_reclaim_seqlock);
5738 mutex_init(&sp->so_delegreturn_mutex);
5741 @@ -1497,8 +1497,12 @@
5742 * recovering after a network partition or a reboot from a
5743 * server that doesn't support a grace period.
5745 +#ifdef CONFIG_PREEMPT_RT_FULL
5746 + write_seqlock(&sp->so_reclaim_seqlock);
5747 +#else
5748 + write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5749 +#endif
5750 spin_lock(&sp->so_lock);
5751 - raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
5753 list_for_each_entry(state, &sp->so_states, open_states) {
5754 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
5755 @@ -1567,14 +1571,20 @@
5756 spin_lock(&sp->so_lock);
5759 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5760 spin_unlock(&sp->so_lock);
5761 +#ifdef CONFIG_PREEMPT_RT_FULL
5762 + write_sequnlock(&sp->so_reclaim_seqlock);
5763 +#else
5764 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5765 +#endif
5768 nfs4_put_open_state(state);
5769 - spin_lock(&sp->so_lock);
5770 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5771 - spin_unlock(&sp->so_lock);
5772 +#ifdef CONFIG_PREEMPT_RT_FULL
5773 + write_sequnlock(&sp->so_reclaim_seqlock);
5774 +#else
5775 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5776 +#endif
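
The hunks above turn the NFSv4 state-owner's bare seqcount into a full seqlock_t: the write side is then serialized by the spinlock embedded in the seqlock, which on RT is a sleeping lock, so a preempted writer cannot leave readers spinning forever on an odd sequence count. A minimal sketch of the resulting reader/writer pairing, using the generic seqlock API rather than the NFS code itself (names are illustrative):

    static DEFINE_SEQLOCK(state_seqlock);
    static int state_a, state_b;

    /* Writer: the lock embedded in the seqlock serializes writers. */
    static void state_update(int a, int b)
    {
            write_seqlock(&state_seqlock);
            state_a = a;
            state_b = b;
            write_sequnlock(&state_seqlock);
    }

    /* Reader: retries if it overlapped a writer. */
    static void state_sample(int *a, int *b)
    {
            unsigned int seq;

            do {
                    seq = read_seqbegin(&state_seqlock);
                    *a = state_a;
                    *b = state_b;
            } while (read_seqretry(&state_seqlock, seq));
    }
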
5780 diff -Nur linux-4.9.6.orig/fs/nfs/unlink.c linux-4.9.6/fs/nfs/unlink.c
5781 --- linux-4.9.6.orig/fs/nfs/unlink.c 2017-01-26 08:25:24.000000000 +0100
5782 +++ linux-4.9.6/fs/nfs/unlink.c 2017-01-28 13:59:09.931656187 +0100
5784 #include <linux/sunrpc/clnt.h>
5785 #include <linux/nfs_fs.h>
5786 #include <linux/sched.h>
5787 -#include <linux/wait.h>
5788 +#include <linux/swait.h>
5789 #include <linux/namei.h>
5790 #include <linux/fsnotify.h>
5793 rpc_restart_call_prepare(task);
5796 +#ifdef CONFIG_PREEMPT_RT_BASE
5797 +static void nfs_down_anon(struct semaphore *sema)
5798 +{
5799 + down(sema);
5800 +}
5801 +
5802 +static void nfs_up_anon(struct semaphore *sema)
5803 +{
5804 + up(sema);
5805 +}
5806 +
5807 +#else
5808 +static void nfs_down_anon(struct rw_semaphore *rwsem)
5809 +{
5810 + down_read_non_owner(rwsem);
5811 +}
5812 +
5813 +static void nfs_up_anon(struct rw_semaphore *rwsem)
5814 +{
5815 + up_read_non_owner(rwsem);
5816 +}
5817 +#endif
5820 * nfs_async_unlink_release - Release the sillydelete data.
5821 * @task: rpc_task of the sillydelete
5823 struct dentry *dentry = data->dentry;
5824 struct super_block *sb = dentry->d_sb;
5826 - up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
5827 + nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
5828 d_lookup_done(dentry);
5829 nfs_free_unlinkdata(data);
5831 @@ -117,10 +140,10 @@
5832 struct inode *dir = d_inode(dentry->d_parent);
5833 struct dentry *alias;
5835 - down_read_non_owner(&NFS_I(dir)->rmdir_sem);
5836 + nfs_down_anon(&NFS_I(dir)->rmdir_sem);
5837 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
5838 if (IS_ERR(alias)) {
5839 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
5840 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
5843 if (!d_in_lookup(alias)) {
5846 spin_unlock(&alias->d_lock);
5848 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
5849 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
5851 * If we'd displaced old cached devname, free it. At that
5852 * point dentry is definitely not a root, so we won't need
5856 data->res.dir_attr = &data->dir_attr;
5857 - init_waitqueue_head(&data->wq);
5858 + init_swait_queue_head(&data->wq);
5861 spin_lock(&dentry->d_lock);
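
The rmdir_sem conversion across the fs/nfs hunks above works because a plain counting semaphore, unlike an rtmutex-based rw_semaphore, records no owner: the down_read_non_owner()/up_read_non_owner() pair spans tasks (taken when the sillydelete RPC is queued, released from the RPC callback), which has no equivalent in the RT rwsem with its owner-based priority inheritance. RT therefore trades reader parallelism for a lock type without owner semantics. A sketch of the cross-task pattern, with hypothetical names:

    static struct semaphore anon_sem;     /* sema_init(&anon_sem, 1) at setup */

    static void submit_async_op(void)
    {
            down(&anon_sem);              /* acquired in the submitting task */
            /* ... queue the async work; the matching up() runs elsewhere ... */
    }

    static void async_op_release(void)    /* runs in the completion task */
    {
            up(&anon_sem);                /* legal: semaphores track no owner */
    }
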
5862 diff -Nur linux-4.9.6.orig/fs/ntfs/aops.c linux-4.9.6/fs/ntfs/aops.c
5863 --- linux-4.9.6.orig/fs/ntfs/aops.c 2017-01-26 08:25:24.000000000 +0100
5864 +++ linux-4.9.6/fs/ntfs/aops.c 2017-01-28 13:59:09.931656187 +0100
5867 if (file_ofs < init_size)
5868 ofs = init_size - file_ofs;
5869 - local_irq_save(flags);
5870 + local_irq_save_nort(flags);
5871 kaddr = kmap_atomic(page);
5872 memset(kaddr + bh_offset(bh) + ofs, 0,
5874 flush_dcache_page(page);
5875 kunmap_atomic(kaddr);
5876 - local_irq_restore(flags);
5877 + local_irq_restore_nort(flags);
5880 clear_buffer_uptodate(bh);
5882 "0x%llx.", (unsigned long long)bh->b_blocknr);
5884 first = page_buffers(page);
5885 - local_irq_save(flags);
5886 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5887 + flags = bh_uptodate_lock_irqsave(first);
5888 clear_buffer_async_read(bh);
5893 tmp = tmp->b_this_page;
5894 } while (tmp != bh);
5895 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5896 - local_irq_restore(flags);
5897 + bh_uptodate_unlock_irqrestore(first, flags);
5899 * If none of the buffers had errors then we can set the page uptodate,
5900 * but we first have to perform the post read mst fixups, if the
5901 @@ -145,13 +143,13 @@
5902 recs = PAGE_SIZE / rec_size;
5903 /* Should have been verified before we got here... */
5905 - local_irq_save(flags);
5906 + local_irq_save_nort(flags);
5907 kaddr = kmap_atomic(page);
5908 for (i = 0; i < recs; i++)
5909 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
5910 i * rec_size), rec_size);
5911 kunmap_atomic(kaddr);
5912 - local_irq_restore(flags);
5913 + local_irq_restore_nort(flags);
5914 flush_dcache_page(page);
5915 if (likely(page_uptodate && !PageError(page)))
5916 SetPageUptodate(page);
5921 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5922 - local_irq_restore(flags);
5924 + bh_uptodate_unlock_irqrestore(first, flags);
5928 diff -Nur linux-4.9.6.orig/fs/proc/base.c linux-4.9.6/fs/proc/base.c
5929 --- linux-4.9.6.orig/fs/proc/base.c 2017-01-26 08:25:24.000000000 +0100
5930 +++ linux-4.9.6/fs/proc/base.c 2017-01-28 13:59:09.931656187 +0100
5931 @@ -1834,7 +1834,7 @@
5933 child = d_hash_and_lookup(dir, &qname);
5935 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5936 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5937 child = d_alloc_parallel(dir, &qname, &wq);
5939 goto end_instantiate;
5940 diff -Nur linux-4.9.6.orig/fs/proc/proc_sysctl.c linux-4.9.6/fs/proc/proc_sysctl.c
5941 --- linux-4.9.6.orig/fs/proc/proc_sysctl.c 2017-01-26 08:25:24.000000000 +0100
5942 +++ linux-4.9.6/fs/proc/proc_sysctl.c 2017-01-28 13:59:09.935656342 +0100
5945 child = d_lookup(dir, &qname);
5947 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5948 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5949 child = d_alloc_parallel(dir, &qname, &wq);
5952 diff -Nur linux-4.9.6.orig/fs/timerfd.c linux-4.9.6/fs/timerfd.c
5953 --- linux-4.9.6.orig/fs/timerfd.c 2017-01-26 08:25:24.000000000 +0100
5954 +++ linux-4.9.6/fs/timerfd.c 2017-01-28 13:59:09.935656342 +0100
5955 @@ -460,7 +460,10 @@
5958 spin_unlock_irq(&ctx->wqh.lock);
5959 - cpu_relax();
5960 + if (isalarm(ctx))
5961 + hrtimer_wait_for_timer(&ctx->t.alarm.timer);
5962 + else
5963 + hrtimer_wait_for_timer(&ctx->t.tmr);
5967 diff -Nur linux-4.9.6.orig/include/acpi/platform/aclinux.h linux-4.9.6/include/acpi/platform/aclinux.h
5968 --- linux-4.9.6.orig/include/acpi/platform/aclinux.h 2017-01-26 08:25:24.000000000 +0100
5969 +++ linux-4.9.6/include/acpi/platform/aclinux.h 2017-01-28 13:59:09.935656342 +0100
5972 #define acpi_cache_t struct kmem_cache
5973 #define acpi_spinlock spinlock_t *
5974 +#define acpi_raw_spinlock raw_spinlock_t *
5975 #define acpi_cpu_flags unsigned long
5977 /* Use native linux version of acpi_os_allocate_zeroed */
5978 @@ -151,6 +152,20 @@
5979 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
5980 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
5982 +#define acpi_os_create_raw_lock(__handle) \
5983 +({ \
5984 + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
5985 + \
5986 + if (lock) { \
5987 + *(__handle) = lock; \
5988 + raw_spin_lock_init(*(__handle)); \
5989 + } \
5990 + lock ? AE_OK : AE_NO_MEMORY; \
5991 +})
5993 +#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
5997 * OSL interfaces used by debugger/disassembler
5999 diff -Nur linux-4.9.6.orig/include/asm-generic/bug.h linux-4.9.6/include/asm-generic/bug.h
6000 --- linux-4.9.6.orig/include/asm-generic/bug.h 2017-01-26 08:25:24.000000000 +0100
6001 +++ linux-4.9.6/include/asm-generic/bug.h 2017-01-28 13:59:09.935656342 +0100
6002 @@ -215,6 +215,20 @@
6003 # define WARN_ON_SMP(x) ({0;})
6006 +#ifdef CONFIG_PREEMPT_RT_BASE
6007 +# define BUG_ON_RT(c) BUG_ON(c)
6008 +# define BUG_ON_NONRT(c) do { } while (0)
6009 +# define WARN_ON_RT(condition) WARN_ON(condition)
6010 +# define WARN_ON_NONRT(condition) do { } while (0)
6011 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6013 +# define BUG_ON_RT(c) do { } while (0)
6014 +# define BUG_ON_NONRT(c) BUG_ON(c)
6015 +# define WARN_ON_RT(condition) do { } while (0)
6016 +# define WARN_ON_NONRT(condition) WARN_ON(condition)
6017 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6020 #endif /* __ASSEMBLY__ */
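
These helpers let assertions that only hold in one locking model compile away in the other: a path that runs with interrupts disabled on !RT typically runs under a sleeping lock on RT, where irqs_disabled() is legitimately false. A short sketch (function name is illustrative):

    static void queue_pull(void)
    {
            /* True under the !RT locking scheme; on RT the caller holds a
             * sleeping lock instead, so the check must not fire there. */
            WARN_ON_NONRT(!irqs_disabled());

            /* ... manipulate the per-CPU queue ... */
    }
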
6023 diff -Nur linux-4.9.6.orig/include/linux/blkdev.h linux-4.9.6/include/linux/blkdev.h
6024 --- linux-4.9.6.orig/include/linux/blkdev.h 2017-01-26 08:25:24.000000000 +0100
6025 +++ linux-4.9.6/include/linux/blkdev.h 2017-01-28 13:59:09.935656342 +0100
6027 struct list_head queuelist;
6029 struct call_single_data csd;
6030 + struct work_struct work;
6035 struct throtl_data *td;
6037 struct rcu_head rcu_head;
6038 - wait_queue_head_t mq_freeze_wq;
6039 + struct swait_queue_head mq_freeze_wq;
6040 struct percpu_ref q_usage_counter;
6041 struct list_head all_q_node;
6043 diff -Nur linux-4.9.6.orig/include/linux/blk-mq.h linux-4.9.6/include/linux/blk-mq.h
6044 --- linux-4.9.6.orig/include/linux/blk-mq.h 2017-01-26 08:25:24.000000000 +0100
6045 +++ linux-4.9.6/include/linux/blk-mq.h 2017-01-28 13:59:09.935656342 +0100
6047 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6051 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6052 int blk_mq_request_started(struct request *rq);
6053 void blk_mq_start_request(struct request *rq);
6054 void blk_mq_end_request(struct request *rq, int error);
6055 diff -Nur linux-4.9.6.orig/include/linux/bottom_half.h linux-4.9.6/include/linux/bottom_half.h
6056 --- linux-4.9.6.orig/include/linux/bottom_half.h 2017-01-26 08:25:24.000000000 +0100
6057 +++ linux-4.9.6/include/linux/bottom_half.h 2017-01-28 13:59:09.935656342 +0100
6060 #include <linux/preempt.h>
6062 +#ifdef CONFIG_PREEMPT_RT_FULL
6064 +extern void __local_bh_disable(void);
6065 +extern void _local_bh_enable(void);
6066 +extern void __local_bh_enable(void);
6068 +static inline void local_bh_disable(void)
6069 +{
6070 + __local_bh_disable();
6071 +}
6073 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6074 +{
6075 + __local_bh_disable();
6076 +}
6078 +static inline void local_bh_enable(void)
6079 +{
6080 + __local_bh_enable();
6081 +}
6083 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6084 +{
6085 + __local_bh_enable();
6086 +}
6088 +static inline void local_bh_enable_ip(unsigned long ip)
6089 +{
6090 + __local_bh_enable();
6091 +}
6095 #ifdef CONFIG_TRACE_IRQFLAGS
6096 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6100 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6104 #endif /* _LINUX_BH_H */
6105 diff -Nur linux-4.9.6.orig/include/linux/buffer_head.h linux-4.9.6/include/linux/buffer_head.h
6106 --- linux-4.9.6.orig/include/linux/buffer_head.h 2017-01-26 08:25:24.000000000 +0100
6107 +++ linux-4.9.6/include/linux/buffer_head.h 2017-01-28 13:59:09.935656342 +0100
6109 struct address_space *b_assoc_map; /* mapping this buffer is
6111 atomic_t b_count; /* users using this buffer_head */
6112 +#ifdef CONFIG_PREEMPT_RT_BASE
6113 + spinlock_t b_uptodate_lock;
6114 +#if IS_ENABLED(CONFIG_JBD2)
6115 + spinlock_t b_state_lock;
6116 + spinlock_t b_journal_head_lock;
6121 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6123 + unsigned long flags;
6125 +#ifndef CONFIG_PREEMPT_RT_BASE
6126 + local_irq_save(flags);
6127 + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6129 + spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6135 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6137 +#ifndef CONFIG_PREEMPT_RT_BASE
6138 + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6139 + local_irq_restore(flags);
6141 + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6145 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6147 +#ifdef CONFIG_PREEMPT_RT_BASE
6148 + spin_lock_init(&bh->b_uptodate_lock);
6149 +#if IS_ENABLED(CONFIG_JBD2)
6150 + spin_lock_init(&bh->b_state_lock);
6151 + spin_lock_init(&bh->b_journal_head_lock);
6157 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6158 * and buffer_foo() functions.
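
bit_spin_lock(BH_Uptodate_Lock, ...) under local_irq_save() is a non-sleeping busy-wait, which RT cannot tolerate, so the buffer_head grows a real spinlock and the two helpers above pick the right implementation per configuration. An end_io-style callsite then looks the same in both worlds; compare the ntfs/aops.c hunks earlier in this patch. A sketch with the surrounding I/O logic elided (the `first = bh` shortcut stands in for page_buffers(page)):

    static void my_end_buffer_async_read(struct buffer_head *bh, int uptodate)
    {
            struct buffer_head *first = bh;   /* real code: page_buffers(page) */
            unsigned long flags;

            flags = bh_uptodate_lock_irqsave(first);
            clear_buffer_async_read(bh);
            /* ... walk the bh->b_this_page ring under the lock ... */
            bh_uptodate_unlock_irqrestore(first, flags);
    }
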
6159 diff -Nur linux-4.9.6.orig/include/linux/cgroup-defs.h linux-4.9.6/include/linux/cgroup-defs.h
6160 --- linux-4.9.6.orig/include/linux/cgroup-defs.h 2017-01-26 08:25:24.000000000 +0100
6161 +++ linux-4.9.6/include/linux/cgroup-defs.h 2017-01-28 13:59:09.935656342 +0100
6163 #include <linux/percpu-refcount.h>
6164 #include <linux/percpu-rwsem.h>
6165 #include <linux/workqueue.h>
6166 +#include <linux/swork.h>
6168 #ifdef CONFIG_CGROUPS
6171 /* percpu_ref killing and RCU release */
6172 struct rcu_head rcu_head;
6173 struct work_struct destroy_work;
6174 + struct swork_event destroy_swork;
6178 diff -Nur linux-4.9.6.orig/include/linux/completion.h linux-4.9.6/include/linux/completion.h
6179 --- linux-4.9.6.orig/include/linux/completion.h 2017-01-26 08:25:24.000000000 +0100
6180 +++ linux-4.9.6/include/linux/completion.h 2017-01-28 13:59:09.935656342 +0100
6182 * Atomic wait-for-completion handler data structures.
6183 * See kernel/sched/completion.c for details.
6186 -#include <linux/wait.h>
6187 +#include <linux/swait.h>
6190 * struct completion - structure used to maintain state for a "completion"
6195 - wait_queue_head_t wait;
6196 + struct swait_queue_head wait;
6199 #define COMPLETION_INITIALIZER(work) \
6200 - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6201 + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6203 #define COMPLETION_INITIALIZER_ONSTACK(work) \
6204 ({ init_completion(&work); work; })
6206 static inline void init_completion(struct completion *x)
6209 - init_waitqueue_head(&x->wait);
6210 + init_swait_queue_head(&x->wait);
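
Completions move to simple wait queues: swait heads use a raw spinlock and do bounded wake-up work, so complete() stays safe from hard-IRQ context on RT. The completion API itself is unchanged for users; a minimal sketch:

    static DECLARE_COMPLETION(fw_loaded);

    static irqreturn_t fw_irq(int irq, void *dev_id)
    {
            complete(&fw_loaded);             /* raw swait lock: RT-safe here */
            return IRQ_HANDLED;
    }

    static int fw_wait(void)
    {
            return wait_for_completion_interruptible(&fw_loaded);
    }
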
6214 diff -Nur linux-4.9.6.orig/include/linux/cpu.h linux-4.9.6/include/linux/cpu.h
6215 --- linux-4.9.6.orig/include/linux/cpu.h 2017-01-26 08:25:24.000000000 +0100
6216 +++ linux-4.9.6/include/linux/cpu.h 2017-01-28 13:59:09.935656342 +0100
6218 extern void put_online_cpus(void);
6219 extern void cpu_hotplug_disable(void);
6220 extern void cpu_hotplug_enable(void);
6221 +extern void pin_current_cpu(void);
6222 +extern void unpin_current_cpu(void);
6223 #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
6224 #define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri)
6225 #define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
6227 #define put_online_cpus() do { } while (0)
6228 #define cpu_hotplug_disable() do { } while (0)
6229 #define cpu_hotplug_enable() do { } while (0)
6230 +static inline void pin_current_cpu(void) { }
6231 +static inline void unpin_current_cpu(void) { }
6232 #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
6233 #define __hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
6234 /* These aren't inline functions due to a GCC bug. */
6235 diff -Nur linux-4.9.6.orig/include/linux/dcache.h linux-4.9.6/include/linux/dcache.h
6236 --- linux-4.9.6.orig/include/linux/dcache.h 2017-01-26 08:25:24.000000000 +0100
6237 +++ linux-4.9.6/include/linux/dcache.h 2017-01-28 13:59:09.935656342 +0100
6239 #include <linux/rcupdate.h>
6240 #include <linux/lockref.h>
6241 #include <linux/stringhash.h>
6242 +#include <linux/wait.h>
6249 struct list_head d_lru; /* LRU list */
6250 - wait_queue_head_t *d_wait; /* in-lookup ones only */
6251 + struct swait_queue_head *d_wait; /* in-lookup ones only */
6253 struct list_head d_child; /* child of parent list */
6254 struct list_head d_subdirs; /* our children */
6256 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6257 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6258 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6259 - wait_queue_head_t *);
6260 + struct swait_queue_head *);
6261 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6262 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6263 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6264 diff -Nur linux-4.9.6.orig/include/linux/delay.h linux-4.9.6/include/linux/delay.h
6265 --- linux-4.9.6.orig/include/linux/delay.h 2017-01-26 08:25:24.000000000 +0100
6266 +++ linux-4.9.6/include/linux/delay.h 2017-01-28 13:59:09.935656342 +0100
6268 msleep(seconds * 1000);
6271 +#ifdef CONFIG_PREEMPT_RT_FULL
6272 +extern void cpu_chill(void);
6273 +#else
6274 +# define cpu_chill() cpu_relax()
6275 +#endif
6277 #endif /* defined(_LINUX_DELAY_H) */
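
cpu_chill() is meant for retry loops that wait on progress made by another, possibly preempted task: on !RT it stays a cpu_relax() busy-wait, while on RT it sleeps briefly so the preempted task can actually run. A sketch with a hypothetical trylock-style operation:

    /* try_grab_object() is a stand-in for any operation that can fail
     * transiently while another task holds the resource. */
    extern bool try_grab_object(void *obj);

    static void grab_object_slowpath(void *obj)
    {
            while (!try_grab_object(obj))
                    cpu_chill();      /* !RT: spin hint; RT: short sleep */
    }
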
6278 diff -Nur linux-4.9.6.orig/include/linux/highmem.h linux-4.9.6/include/linux/highmem.h
6279 --- linux-4.9.6.orig/include/linux/highmem.h 2017-01-26 08:25:24.000000000 +0100
6280 +++ linux-4.9.6/include/linux/highmem.h 2017-01-28 13:59:09.935656342 +0100
6282 #include <linux/mm.h>
6283 #include <linux/uaccess.h>
6284 #include <linux/hardirq.h>
6285 +#include <linux/sched.h>
6287 #include <asm/cacheflush.h>
6291 static inline void *kmap_atomic(struct page *page)
6293 - preempt_disable();
6294 + preempt_disable_nort();
6295 pagefault_disable();
6296 return page_address(page);
6299 static inline void __kunmap_atomic(void *addr)
6303 + preempt_enable_nort();
6306 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
6309 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6311 +#ifndef CONFIG_PREEMPT_RT_FULL
6312 DECLARE_PER_CPU(int, __kmap_atomic_idx);
6315 static inline int kmap_atomic_idx_push(void)
6317 +#ifndef CONFIG_PREEMPT_RT_FULL
6318 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6320 -#ifdef CONFIG_DEBUG_HIGHMEM
6321 +# ifdef CONFIG_DEBUG_HIGHMEM
6322 WARN_ON_ONCE(in_irq() && !irqs_disabled());
6323 BUG_ON(idx >= KM_TYPE_NR);
6324 -#endif
6325 +# endif
6326 return idx;
6327 +#else
6328 + current->kmap_idx++;
6329 + BUG_ON(current->kmap_idx > KM_TYPE_NR);
6330 + return current->kmap_idx - 1;
6331 +#endif
6334 static inline int kmap_atomic_idx(void)
6336 +#ifndef CONFIG_PREEMPT_RT_FULL
6337 return __this_cpu_read(__kmap_atomic_idx) - 1;
6338 +#else
6339 + return current->kmap_idx - 1;
6340 +#endif
6343 static inline void kmap_atomic_idx_pop(void)
6345 -#ifdef CONFIG_DEBUG_HIGHMEM
6346 +#ifndef CONFIG_PREEMPT_RT_FULL
6347 +# ifdef CONFIG_DEBUG_HIGHMEM
6348 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6350 BUG_ON(idx < 0);
6351 -#else
6352 +# else
6353 __this_cpu_dec(__kmap_atomic_idx);
6354 +# endif
6355 +#else
6356 + current->kmap_idx--;
6357 +# ifdef CONFIG_DEBUG_HIGHMEM
6358 + BUG_ON(current->kmap_idx < 0);
6359 +# endif
6360 +#endif
6363 diff -Nur linux-4.9.6.orig/include/linux/hrtimer.h linux-4.9.6/include/linux/hrtimer.h
6364 --- linux-4.9.6.orig/include/linux/hrtimer.h 2017-01-26 08:25:24.000000000 +0100
6365 +++ linux-4.9.6/include/linux/hrtimer.h 2017-01-28 13:59:09.935656342 +0100
6367 * @function: timer expiry callback function
6368 * @base: pointer to the timer base (per cpu and per clock)
6369 * @state: state information (See bit values above)
6370 + * @cb_entry: list entry to defer timers from hardirq context
6371 + * @irqsafe: timer can run in hardirq context
6372 + * @praecox: timer expiry time if expired at the time of programming
6373 * @is_rel: Set if the timer was armed relative
6374 * @start_pid: timer statistics field to store the pid of the task which
6376 @@ -103,6 +106,11 @@
6377 enum hrtimer_restart (*function)(struct hrtimer *);
6378 struct hrtimer_clock_base *base;
6380 + struct list_head cb_entry;
6382 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6386 #ifdef CONFIG_TIMER_STATS
6388 @@ -123,11 +131,7 @@
6389 struct task_struct *task;
6392 -#ifdef CONFIG_64BIT
6393 # define HRTIMER_CLOCK_BASE_ALIGN 64
6395 -# define HRTIMER_CLOCK_BASE_ALIGN 32
6399 * struct hrtimer_clock_base - the timer base for a specific clock
6401 * timer to a base on another cpu.
6402 * @clockid: clock id for per_cpu support
6403 * @active: red black tree root node for the active timers
6404 + * @expired: list head for deferred timers.
6405 * @get_time: function to retrieve the current time of the clock
6406 * @offset: offset of this clock to the monotonic base
6411 struct timerqueue_head active;
6412 + struct list_head expired;
6413 ktime_t (*get_time)(void);
6415 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6417 raw_spinlock_t lock;
6419 struct hrtimer *running;
6420 + struct hrtimer *running_soft;
6422 unsigned int active_bases;
6423 unsigned int clock_was_set_seq;
6425 unsigned int nr_hangs;
6426 unsigned int max_hang_time;
6428 +#ifdef CONFIG_PREEMPT_RT_BASE
6429 + wait_queue_head_t wait;
6431 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
6432 } ____cacheline_aligned;
6434 @@ -412,6 +422,13 @@
6435 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6438 +/* Softirq preemption could deadlock timer removal */
6439 +#ifdef CONFIG_PREEMPT_RT_BASE
6440 + extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6442 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6446 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6449 * Helper function to check, whether the timer is running the callback
6452 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6453 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6455 return timer->base->cpu_base->running == timer;
6457 diff -Nur linux-4.9.6.orig/include/linux/idr.h linux-4.9.6/include/linux/idr.h
6458 --- linux-4.9.6.orig/include/linux/idr.h 2017-01-26 08:25:24.000000000 +0100
6459 +++ linux-4.9.6/include/linux/idr.h 2017-01-28 13:59:09.935656342 +0100
6461 * Each idr_preload() should be matched with an invocation of this
6462 * function. See idr_preload() for details.
6464 +#ifdef CONFIG_PREEMPT_RT_FULL
6465 +void idr_preload_end(void);
6466 +#else
6467 static inline void idr_preload_end(void)
6474 * idr_find - return pointer for given id
6475 diff -Nur linux-4.9.6.orig/include/linux/init_task.h linux-4.9.6/include/linux/init_task.h
6476 --- linux-4.9.6.orig/include/linux/init_task.h 2017-01-26 08:25:24.000000000 +0100
6477 +++ linux-4.9.6/include/linux/init_task.h 2017-01-28 13:59:09.935656342 +0100
6478 @@ -150,6 +150,12 @@
6479 # define INIT_PERF_EVENTS(tsk)
6482 +#ifdef CONFIG_PREEMPT_RT_BASE
6483 +# define INIT_TIMER_LIST .posix_timer_list = NULL,
6484 +#else
6485 +# define INIT_TIMER_LIST
6486 +#endif
6488 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6489 # define INIT_VTIME(tsk) \
6490 .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
6492 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
6493 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
6494 .timer_slack_ns = 50000, /* 50 usec default slack */ \
6497 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
6498 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
6499 diff -Nur linux-4.9.6.orig/include/linux/interrupt.h linux-4.9.6/include/linux/interrupt.h
6500 --- linux-4.9.6.orig/include/linux/interrupt.h 2017-01-26 08:25:24.000000000 +0100
6501 +++ linux-4.9.6/include/linux/interrupt.h 2017-01-28 13:59:09.935656342 +0100
6503 #include <linux/hrtimer.h>
6504 #include <linux/kref.h>
6505 #include <linux/workqueue.h>
6506 +#include <linux/swork.h>
6508 #include <linux/atomic.h>
6509 #include <asm/ptrace.h>
6511 * interrupt handler after suspending interrupts. For system
6512 * wakeup devices users need to implement wakeup detection in
6513 * their interrupt handlers.
6514 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6516 #define IRQF_SHARED 0x00000080
6517 #define IRQF_PROBE_SHARED 0x00000100
6519 #define IRQF_NO_THREAD 0x00010000
6520 #define IRQF_EARLY_RESUME 0x00020000
6521 #define IRQF_COND_SUSPEND 0x00040000
6522 +#define IRQF_NO_SOFTIRQ_CALL 0x00080000
6524 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6527 #ifdef CONFIG_LOCKDEP
6528 # define local_irq_enable_in_hardirq() do { } while (0)
6530 -# define local_irq_enable_in_hardirq() local_irq_enable()
6531 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6534 extern void disable_irq_nosync(unsigned int irq);
6536 * struct irq_affinity_notify - context for notification of IRQ affinity changes
6537 * @irq: Interrupt to which notification applies
6538 * @kref: Reference count, for internal use
6539 + * @swork: Swork item, for internal use
6540 * @work: Work item, for internal use
6541 * @notify: Function to be called on change. This will be
6542 * called in process context.
6543 @@ -227,7 +231,11 @@
6544 struct irq_affinity_notify {
6547 +#ifdef CONFIG_PREEMPT_RT_BASE
6548 + struct swork_event swork;
6549 +#else
6550 struct work_struct work;
6551 +#endif
6552 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6553 void (*release)(struct kref *ref);
6555 @@ -406,9 +414,13 @@
6558 #ifdef CONFIG_IRQ_FORCED_THREADING
6559 +# ifndef CONFIG_PREEMPT_RT_BASE
6560 extern bool force_irqthreads;
6561 +# else
6562 +# define force_irqthreads (true)
6563 +# endif
6564 #else
6565 -#define force_irqthreads (0)
6566 +#define force_irqthreads (false)
6569 #ifndef __ARCH_SET_SOFTIRQ_PENDING
6570 @@ -465,9 +477,10 @@
6571 void (*action)(struct softirq_action *);
6574 +#ifndef CONFIG_PREEMPT_RT_FULL
6575 asmlinkage void do_softirq(void);
6576 asmlinkage void __do_softirq(void);
6578 +static inline void thread_do_softirq(void) { do_softirq(); }
6579 #ifdef __ARCH_HAS_DO_SOFTIRQ
6580 void do_softirq_own_stack(void);
6582 @@ -476,13 +489,25 @@
6587 +extern void thread_do_softirq(void);
6590 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
6591 extern void softirq_init(void);
6592 extern void __raise_softirq_irqoff(unsigned int nr);
6593 +#ifdef CONFIG_PREEMPT_RT_FULL
6594 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
6595 +#else
6596 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
6597 +{
6598 + __raise_softirq_irqoff(nr);
6599 +}
6600 +#endif
6602 extern void raise_softirq_irqoff(unsigned int nr);
6603 extern void raise_softirq(unsigned int nr);
6604 +extern void softirq_check_pending_idle(void);
6606 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
6609 to be executed on some cpu at least once after this.
6610 * If the tasklet is already scheduled, but its execution is still not
6611 started, it will be executed only once.
6612 - * If this tasklet is already running on another CPU (or schedule is called
6613 - from tasklet itself), it is rescheduled for later.
6614 + * If this tasklet is already running on another CPU, it is rescheduled
6615 +   for later.
6616 + * Schedule must not be called from the tasklet itself (a lockup occurs)
6617 * Tasklet is strictly serialized wrt itself, but not
6618 wrt another tasklets. If client needs some intertask synchronization,
6619 he makes it with spinlocks.
6620 @@ -530,27 +556,36 @@
6623 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
6624 - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
6625 + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
6626 + TASKLET_STATE_PENDING /* Tasklet is pending */
6630 +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
6631 +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
6632 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
6634 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
6635 static inline int tasklet_trylock(struct tasklet_struct *t)
6637 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
6640 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
6642 + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
6645 static inline void tasklet_unlock(struct tasklet_struct *t)
6647 smp_mb__before_atomic();
6648 clear_bit(TASKLET_STATE_RUN, &(t)->state);
6651 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
6653 - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
6655 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
6658 #define tasklet_trylock(t) 1
6659 +#define tasklet_tryunlock(t) 1
6660 #define tasklet_unlock_wait(t) do { } while (0)
6661 #define tasklet_unlock(t) do { } while (0)
6663 @@ -599,12 +634,7 @@
6667 -static inline void tasklet_enable(struct tasklet_struct *t)
6669 - smp_mb__before_atomic();
6670 - atomic_dec(&t->count);
6673 +extern void tasklet_enable(struct tasklet_struct *t);
6674 extern void tasklet_kill(struct tasklet_struct *t);
6675 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
6676 extern void tasklet_init(struct tasklet_struct *t,
6677 @@ -635,6 +665,12 @@
6678 tasklet_kill(&ttimer->tasklet);
6681 +#ifdef CONFIG_PREEMPT_RT_FULL
6682 +extern void softirq_early_init(void);
6683 +#else
6684 +static inline void softirq_early_init(void) { }
6685 +#endif
6688 * Autoprobing for irqs:
6690 diff -Nur linux-4.9.6.orig/include/linux/irqdesc.h linux-4.9.6/include/linux/irqdesc.h
6691 --- linux-4.9.6.orig/include/linux/irqdesc.h 2017-01-26 08:25:24.000000000 +0100
6692 +++ linux-4.9.6/include/linux/irqdesc.h 2017-01-28 13:59:09.935656342 +0100
6694 unsigned int irqs_unhandled;
6695 atomic_t threads_handled;
6696 int threads_handled_last;
6698 raw_spinlock_t lock;
6699 struct cpumask *percpu_enabled;
6700 const struct cpumask *percpu_affinity;
6701 diff -Nur linux-4.9.6.orig/include/linux/irqflags.h linux-4.9.6/include/linux/irqflags.h
6702 --- linux-4.9.6.orig/include/linux/irqflags.h 2017-01-26 08:25:24.000000000 +0100
6703 +++ linux-4.9.6/include/linux/irqflags.h 2017-01-28 13:59:09.935656342 +0100
6705 # define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
6706 # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
6707 # define trace_hardirq_exit() do { current->hardirq_context--; } while (0)
6708 -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
6709 -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
6710 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
6712 # define trace_hardirqs_on() do { } while (0)
6714 # define trace_softirqs_enabled(p) 0
6715 # define trace_hardirq_enter() do { } while (0)
6716 # define trace_hardirq_exit() do { } while (0)
6717 +# define INIT_TRACE_IRQFLAGS
6720 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
6721 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
6722 +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
6724 # define lockdep_softirq_enter() do { } while (0)
6725 # define lockdep_softirq_exit() do { } while (0)
6726 -# define INIT_TRACE_IRQFLAGS
6729 #if defined(CONFIG_IRQSOFF_TRACER) || \
6730 @@ -148,4 +152,23 @@
6732 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
6735 + * local_irq* variants depending on RT/!RT
6737 +#ifdef CONFIG_PREEMPT_RT_FULL
6738 +# define local_irq_disable_nort() do { } while (0)
6739 +# define local_irq_enable_nort() do { } while (0)
6740 +# define local_irq_save_nort(flags) local_save_flags(flags)
6741 +# define local_irq_restore_nort(flags) (void)(flags)
6742 +# define local_irq_disable_rt() local_irq_disable()
6743 +# define local_irq_enable_rt() local_irq_enable()
6745 +# define local_irq_disable_nort() local_irq_disable()
6746 +# define local_irq_enable_nort() local_irq_enable()
6747 +# define local_irq_save_nort(flags) local_irq_save(flags)
6748 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
6749 +# define local_irq_disable_rt() do { } while (0)
6750 +# define local_irq_enable_rt() do { } while (0)
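
The _nort variants really disable interrupts only on !RT; on RT they degrade to saving the flags (or to nothing), because the section they guard is protected by sleeping locks instead and must stay preemptible. This mirrors the ntfs/aops.c hunks earlier in the patch; a sketch (function name is illustrative):

    static void zero_partial_block(struct page *page, unsigned int ofs,
                                   unsigned int len)
    {
            unsigned long flags;
            void *kaddr;

            local_irq_save_nort(flags);   /* hard irq-off only on !RT */
            kaddr = kmap_atomic(page);
            memset(kaddr + ofs, 0, len);
            kunmap_atomic(kaddr);
            local_irq_restore_nort(flags);
    }
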
6754 diff -Nur linux-4.9.6.orig/include/linux/irq.h linux-4.9.6/include/linux/irq.h
6755 --- linux-4.9.6.orig/include/linux/irq.h 2017-01-26 08:25:24.000000000 +0100
6756 +++ linux-4.9.6/include/linux/irq.h 2017-01-28 13:59:09.935656342 +0100
6758 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
6759 * it from the spurious interrupt detection
6760 * mechanism and from core side polling.
6761 + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
6762 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
6765 @@ -99,13 +100,14 @@
6766 IRQ_PER_CPU_DEVID = (1 << 17),
6767 IRQ_IS_POLLED = (1 << 18),
6768 IRQ_DISABLE_UNLAZY = (1 << 19),
6769 + IRQ_NO_SOFTIRQ_CALL = (1 << 20),
6772 #define IRQF_MODIFY_MASK \
6773 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
6774 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
6775 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
6776 - IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
6777 + IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
6779 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
6781 diff -Nur linux-4.9.6.orig/include/linux/irq_work.h linux-4.9.6/include/linux/irq_work.h
6782 --- linux-4.9.6.orig/include/linux/irq_work.h 2017-01-26 08:25:24.000000000 +0100
6783 +++ linux-4.9.6/include/linux/irq_work.h 2017-01-28 13:59:09.935656342 +0100
6785 #define IRQ_WORK_BUSY 2UL
6786 #define IRQ_WORK_FLAGS 3UL
6787 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
6788 +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
6791 unsigned long flags;
6793 static inline void irq_work_run(void) { }
6796 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
6797 +void irq_work_tick_soft(void);
6799 +static inline void irq_work_tick_soft(void) { }
6802 #endif /* _LINUX_IRQ_WORK_H */
6803 diff -Nur linux-4.9.6.orig/include/linux/jbd2.h linux-4.9.6/include/linux/jbd2.h
6804 --- linux-4.9.6.orig/include/linux/jbd2.h 2017-01-26 08:25:24.000000000 +0100
6805 +++ linux-4.9.6/include/linux/jbd2.h 2017-01-28 13:59:09.939656496 +0100
6806 @@ -347,32 +347,56 @@
6808 static inline void jbd_lock_bh_state(struct buffer_head *bh)
6810 +#ifndef CONFIG_PREEMPT_RT_BASE
6811 bit_spin_lock(BH_State, &bh->b_state);
6812 +#else
6813 + spin_lock(&bh->b_state_lock);
6814 +#endif
6817 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
6819 +#ifndef CONFIG_PREEMPT_RT_BASE
6820 return bit_spin_trylock(BH_State, &bh->b_state);
6821 +#else
6822 + return spin_trylock(&bh->b_state_lock);
6823 +#endif
6826 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
6828 +#ifndef CONFIG_PREEMPT_RT_BASE
6829 return bit_spin_is_locked(BH_State, &bh->b_state);
6830 +#else
6831 + return spin_is_locked(&bh->b_state_lock);
6832 +#endif
6835 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
6837 +#ifndef CONFIG_PREEMPT_RT_BASE
6838 bit_spin_unlock(BH_State, &bh->b_state);
6839 +#else
6840 + spin_unlock(&bh->b_state_lock);
6841 +#endif
6844 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
6846 +#ifndef CONFIG_PREEMPT_RT_BASE
6847 bit_spin_lock(BH_JournalHead, &bh->b_state);
6848 +#else
6849 + spin_lock(&bh->b_journal_head_lock);
6850 +#endif
6853 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
6855 +#ifndef CONFIG_PREEMPT_RT_BASE
6856 bit_spin_unlock(BH_JournalHead, &bh->b_state);
6857 +#else
6858 + spin_unlock(&bh->b_journal_head_lock);
6859 +#endif
6862 #define J_ASSERT(assert) BUG_ON(!(assert))
6863 diff -Nur linux-4.9.6.orig/include/linux/kdb.h linux-4.9.6/include/linux/kdb.h
6864 --- linux-4.9.6.orig/include/linux/kdb.h 2017-01-26 08:25:24.000000000 +0100
6865 +++ linux-4.9.6/include/linux/kdb.h 2017-01-28 13:59:09.939656496 +0100
6867 extern __printf(1, 2) int kdb_printf(const char *, ...);
6868 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
6870 +#define in_kdb_printk() (kdb_trap_printk)
6871 extern void kdb_init(int level);
6873 /* Access to kdb specific polling devices */
6875 extern int kdb_unregister(char *);
6876 #else /* ! CONFIG_KGDB_KDB */
6877 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
6878 +#define in_kdb_printk() (0)
6879 static inline void kdb_init(int level) {}
6880 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
6881 char *help, short minlen) { return 0; }
6882 diff -Nur linux-4.9.6.orig/include/linux/kernel.h linux-4.9.6/include/linux/kernel.h
6883 --- linux-4.9.6.orig/include/linux/kernel.h 2017-01-26 08:25:24.000000000 +0100
6884 +++ linux-4.9.6/include/linux/kernel.h 2017-01-28 13:59:09.939656496 +0100
6887 # define might_sleep() \
6888 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
6890 +# define might_sleep_no_state_check() \
6891 + do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
6892 # define sched_annotate_sleep() (current->task_state_change = 0)
6894 static inline void ___might_sleep(const char *file, int line,
6896 static inline void __might_sleep(const char *file, int line,
6897 int preempt_offset) { }
6898 # define might_sleep() do { might_resched(); } while (0)
6899 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
6900 # define sched_annotate_sleep() do { } while (0)
6910 #define TAINT_PROPRIETARY_MODULE 0
6911 diff -Nur linux-4.9.6.orig/include/linux/list_bl.h linux-4.9.6/include/linux/list_bl.h
6912 --- linux-4.9.6.orig/include/linux/list_bl.h 2017-01-26 08:25:24.000000000 +0100
6913 +++ linux-4.9.6/include/linux/list_bl.h 2017-01-28 13:59:09.939656496 +0100
6915 #define _LINUX_LIST_BL_H
6917 #include <linux/list.h>
6918 +#include <linux/spinlock.h>
6919 #include <linux/bit_spinlock.h>
6924 struct hlist_bl_head {
6925 struct hlist_bl_node *first;
6926 +#ifdef CONFIG_PREEMPT_RT_BASE
6927 + raw_spinlock_t lock;
6928 +#endif
6931 struct hlist_bl_node {
6932 struct hlist_bl_node *next, **pprev;
6934 -#define INIT_HLIST_BL_HEAD(ptr) \
6935 - ((ptr)->first = NULL)
6937 +#ifdef CONFIG_PREEMPT_RT_BASE
6938 +#define INIT_HLIST_BL_HEAD(h) \
6939 +do { \
6940 + (h)->first = NULL; \
6941 + raw_spin_lock_init(&(h)->lock); \
6942 +} while (0)
6943 +#else
6944 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
6945 +#endif
6947 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
6949 @@ -118,12 +130,26 @@
6951 static inline void hlist_bl_lock(struct hlist_bl_head *b)
6953 +#ifndef CONFIG_PREEMPT_RT_BASE
6954 bit_spin_lock(0, (unsigned long *)b);
6955 +#else
6956 + raw_spin_lock(&b->lock);
6957 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
6958 + __set_bit(0, (unsigned long *)b);
6959 +#endif
6960 +#endif
6963 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
6965 +#ifndef CONFIG_PREEMPT_RT_BASE
6966 __bit_spin_unlock(0, (unsigned long *)b);
6967 +#else
6968 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
6969 + __clear_bit(0, (unsigned long *)b);
6970 +#endif
6971 + raw_spin_unlock(&b->lock);
6972 +#endif
6975 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
6976 diff -Nur linux-4.9.6.orig/include/linux/locallock.h linux-4.9.6/include/linux/locallock.h
6977 --- linux-4.9.6.orig/include/linux/locallock.h 1970-01-01 01:00:00.000000000 +0100
6978 +++ linux-4.9.6/include/linux/locallock.h 2017-01-28 13:59:09.939656496 +0100
6980 +#ifndef _LINUX_LOCALLOCK_H
6981 +#define _LINUX_LOCALLOCK_H
6983 +#include <linux/percpu.h>
6984 +#include <linux/spinlock.h>
6986 +#ifdef CONFIG_PREEMPT_RT_BASE
6988 +#ifdef CONFIG_DEBUG_SPINLOCK
6989 +# define LL_WARN(cond) WARN_ON(cond)
6991 +# define LL_WARN(cond) do { } while (0)
6995 + * per cpu lock based substitute for local_irq_*()
6997 +struct local_irq_lock {
6998 + spinlock_t lock;
6999 + struct task_struct *owner;
7000 + int nestcnt;
7001 + unsigned long flags;
7002 +};
7004 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
7005 + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
7006 + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7008 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
7009 + DECLARE_PER_CPU(struct local_irq_lock, lvar)
7011 +#define local_irq_lock_init(lvar) \
7014 + for_each_possible_cpu(__cpu) \
7015 + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
7019 + * spin_lock|trylock|unlock_local flavours that do not migrate_disable();
7020 + * used for __local_lock|trylock|unlock, where get_local_var/put_local_var
7021 + * already take care of the migrate_disable/enable. Without
7022 + * PREEMPT_RT_FULL they map to the normal spin_* calls.
7024 +#ifdef CONFIG_PREEMPT_RT_FULL
7025 +# define spin_lock_local(lock) rt_spin_lock__no_mg(lock)
7026 +# define spin_trylock_local(lock) rt_spin_trylock__no_mg(lock)
7027 +# define spin_unlock_local(lock) rt_spin_unlock__no_mg(lock)
7029 +# define spin_lock_local(lock) spin_lock(lock)
7030 +# define spin_trylock_local(lock) spin_trylock(lock)
7031 +# define spin_unlock_local(lock) spin_unlock(lock)
7034 +static inline void __local_lock(struct local_irq_lock *lv)
7036 + if (lv->owner != current) {
7037 + spin_lock_local(&lv->lock);
7038 + LL_WARN(lv->owner);
7039 + LL_WARN(lv->nestcnt);
7040 + lv->owner = current;
7045 +#define local_lock(lvar) \
7046 + do { __local_lock(&get_local_var(lvar)); } while (0)
7048 +#define local_lock_on(lvar, cpu) \
7049 + do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7051 +static inline int __local_trylock(struct local_irq_lock *lv)
7053 + if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7054 + LL_WARN(lv->owner);
7055 + LL_WARN(lv->nestcnt);
7056 + lv->owner = current;
7063 +#define local_trylock(lvar) \
7066 + __locked = __local_trylock(&get_local_var(lvar)); \
7068 + put_local_var(lvar); \
7072 +static inline void __local_unlock(struct local_irq_lock *lv)
7074 + LL_WARN(lv->nestcnt == 0);
7075 + LL_WARN(lv->owner != current);
7076 + if (--lv->nestcnt)
7080 + spin_unlock_local(&lv->lock);
7083 +#define local_unlock(lvar) \
7085 + __local_unlock(this_cpu_ptr(&lvar)); \
7086 + put_local_var(lvar); \
7089 +#define local_unlock_on(lvar, cpu) \
7090 + do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7092 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7094 + spin_lock_irqsave(&lv->lock, lv->flags);
7095 + LL_WARN(lv->owner);
7096 + LL_WARN(lv->nestcnt);
7097 + lv->owner = current;
7101 +#define local_lock_irq(lvar) \
7102 + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7104 +#define local_lock_irq_on(lvar, cpu) \
7105 + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7107 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7109 + LL_WARN(!lv->nestcnt);
7110 + LL_WARN(lv->owner != current);
7113 + spin_unlock_irq(&lv->lock);
7116 +#define local_unlock_irq(lvar) \
7118 + __local_unlock_irq(this_cpu_ptr(&lvar)); \
7119 + put_local_var(lvar); \
7122 +#define local_unlock_irq_on(lvar, cpu) \
7124 + __local_unlock_irq(&per_cpu(lvar, cpu)); \
7127 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7129 + if (lv->owner != current) {
7130 + __local_lock_irq(lv);
7138 +#define local_lock_irqsave(lvar, _flags) \
7140 + if (__local_lock_irqsave(&get_local_var(lvar))) \
7141 + put_local_var(lvar); \
7142 + _flags = __this_cpu_read(lvar.flags); \
7145 +#define local_lock_irqsave_on(lvar, _flags, cpu) \
7147 + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
7148 + _flags = per_cpu(lvar, cpu).flags; \
7151 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7152 + unsigned long flags)
7154 + LL_WARN(!lv->nestcnt);
7155 + LL_WARN(lv->owner != current);
7156 + if (--lv->nestcnt)
7160 + spin_unlock_irqrestore(&lv->lock, lv->flags);
7164 +#define local_unlock_irqrestore(lvar, flags) \
7166 + if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7167 + put_local_var(lvar); \
7170 +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
7172 + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
7175 +#define local_spin_trylock_irq(lvar, lock) \
7178 + local_lock_irq(lvar); \
7179 + __locked = spin_trylock(lock); \
7181 + local_unlock_irq(lvar); \
7185 +#define local_spin_lock_irq(lvar, lock) \
7187 + local_lock_irq(lvar); \
7188 + spin_lock(lock); \
7191 +#define local_spin_unlock_irq(lvar, lock) \
7193 + spin_unlock(lock); \
7194 + local_unlock_irq(lvar); \
7197 +#define local_spin_lock_irqsave(lvar, lock, flags) \
7199 + local_lock_irqsave(lvar, flags); \
7200 + spin_lock(lock); \
7203 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
7205 + spin_unlock(lock); \
7206 + local_unlock_irqrestore(lvar, flags); \
7209 +#define get_locked_var(lvar, var) \
7211 + local_lock(lvar); \
7212 + this_cpu_ptr(&var); \
7215 +#define put_locked_var(lvar, var) local_unlock(lvar);
7217 +#define local_lock_cpu(lvar) \
7219 + local_lock(lvar); \
7220 + smp_processor_id(); \
7223 +#define local_unlock_cpu(lvar) local_unlock(lvar)
7225 +#else /* PREEMPT_RT_BASE */
7227 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
7228 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
7230 +static inline void local_irq_lock_init(int lvar) { }
7232 +#define local_lock(lvar) preempt_disable()
7233 +#define local_unlock(lvar) preempt_enable()
7234 +#define local_lock_irq(lvar) local_irq_disable()
7235 +#define local_lock_irq_on(lvar, cpu) local_irq_disable()
7236 +#define local_unlock_irq(lvar) local_irq_enable()
7237 +#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
7238 +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
7239 +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
7241 +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
7242 +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
7243 +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
7244 +#define local_spin_lock_irqsave(lvar, lock, flags) \
7245 + spin_lock_irqsave(lock, flags)
7246 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
7247 + spin_unlock_irqrestore(lock, flags)
7249 +#define get_locked_var(lvar, var) get_cpu_var(var)
7250 +#define put_locked_var(lvar, var) put_cpu_var(var)
7252 +#define local_lock_cpu(lvar) get_cpu()
7253 +#define local_unlock_cpu(lvar) put_cpu()
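
Typical locallock usage: per-CPU data that was previously guarded by local_irq_save() gets a named per-CPU lock. On !RT the macros collapse to the original preempt/irq primitives at zero cost; on RT they take the per-CPU spinlock, which preserves the "same CPU, exclusive access" invariant while remaining preemptible. A sketch with hypothetical per-CPU stats:

    struct my_stats {
            unsigned long events;
    };

    static DEFINE_PER_CPU(struct my_stats, my_stats);
    static DEFINE_LOCAL_IRQ_LOCK(my_stats_lock);

    static void my_stats_event(void)
    {
            unsigned long flags;

            local_lock_irqsave(my_stats_lock, flags);
            this_cpu_ptr(&my_stats)->events++;
            local_unlock_irqrestore(my_stats_lock, flags);
    }
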
7258 diff -Nur linux-4.9.6.orig/include/linux/mm_types.h linux-4.9.6/include/linux/mm_types.h
7259 --- linux-4.9.6.orig/include/linux/mm_types.h 2017-01-26 08:25:24.000000000 +0100
7260 +++ linux-4.9.6/include/linux/mm_types.h 2017-01-28 13:59:09.939656496 +0100
7262 #include <linux/completion.h>
7263 #include <linux/cpumask.h>
7264 #include <linux/uprobes.h>
7265 +#include <linux/rcupdate.h>
7266 #include <linux/page-flags-layout.h>
7267 #include <linux/workqueue.h>
7268 #include <asm/page.h>
7270 bool tlb_flush_pending;
7272 struct uprobes_state uprobes_state;
7273 +#ifdef CONFIG_PREEMPT_RT_BASE
7274 + struct rcu_head delayed_drop;
7276 #ifdef CONFIG_X86_INTEL_MPX
7277 /* address of the bounds directory */
7278 void __user *bd_addr;
7279 diff -Nur linux-4.9.6.orig/include/linux/mutex.h linux-4.9.6/include/linux/mutex.h
7280 --- linux-4.9.6.orig/include/linux/mutex.h 2017-01-26 08:25:24.000000000 +0100
7281 +++ linux-4.9.6/include/linux/mutex.h 2017-01-28 13:59:09.939656496 +0100
7283 #include <asm/processor.h>
7284 #include <linux/osq_lock.h>
7286 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7287 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7288 + , .dep_map = { .name = #lockname }
7290 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7293 +#ifdef CONFIG_PREEMPT_RT_FULL
7294 +# include <linux/mutex_rt.h>
7298 * Simple, straightforward mutexes with strict semantics:
7301 static inline void mutex_destroy(struct mutex *lock) {}
7304 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7305 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7306 - , .dep_map = { .name = #lockname }
7308 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7311 #define __MUTEX_INITIALIZER(lockname) \
7312 { .count = ATOMIC_INIT(1) \
7313 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7315 extern int mutex_trylock(struct mutex *lock);
7316 extern void mutex_unlock(struct mutex *lock);
7318 +#endif /* !PREEMPT_RT_FULL */
7320 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7322 #endif /* __LINUX_MUTEX_H */
7323 diff -Nur linux-4.9.6.orig/include/linux/mutex_rt.h linux-4.9.6/include/linux/mutex_rt.h
7324 --- linux-4.9.6.orig/include/linux/mutex_rt.h 1970-01-01 01:00:00.000000000 +0100
7325 +++ linux-4.9.6/include/linux/mutex_rt.h 2017-01-28 13:59:09.939656496 +0100
7327 +#ifndef __LINUX_MUTEX_RT_H
7328 +#define __LINUX_MUTEX_RT_H
7330 +#ifndef __LINUX_MUTEX_H
7331 +#error "Please include mutex.h"
7334 +#include <linux/rtmutex.h>
7336 +/* FIXME: Just for __lockfunc */
7337 +#include <linux/spinlock.h>
7339 +struct mutex {
7340 + struct rt_mutex lock;
7341 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7342 + struct lockdep_map dep_map;
7343 +#endif
7344 +};
7346 +#define __MUTEX_INITIALIZER(mutexname) \
7348 + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
7349 + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
7352 +#define DEFINE_MUTEX(mutexname) \
7353 + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7355 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7356 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7357 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7358 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7359 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7360 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7361 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7362 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7363 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7364 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7366 +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
7367 +#define mutex_lock(l) _mutex_lock(l)
7368 +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
7369 +#define mutex_lock_killable(l) _mutex_lock_killable(l)
7370 +#define mutex_trylock(l) _mutex_trylock(l)
7371 +#define mutex_unlock(l) _mutex_unlock(l)
7372 +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
7374 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7375 +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
7376 +# define mutex_lock_interruptible_nested(l, s) \
7377 + _mutex_lock_interruptible_nested(l, s)
7378 +# define mutex_lock_killable_nested(l, s) \
7379 + _mutex_lock_killable_nested(l, s)
7381 +# define mutex_lock_nest_lock(lock, nest_lock) \
7383 + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
7384 + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
7388 +# define mutex_lock_nested(l, s) _mutex_lock(l)
7389 +# define mutex_lock_interruptible_nested(l, s) \
7390 + _mutex_lock_interruptible(l)
7391 +# define mutex_lock_killable_nested(l, s) \
7392 + _mutex_lock_killable(l)
7393 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7396 +# define mutex_init(mutex) \
7397 +do { \
7398 + static struct lock_class_key __key; \
7400 + rt_mutex_init(&(mutex)->lock); \
7401 + __mutex_do_init((mutex), #mutex, &__key); \
7402 +} while (0)
7404 +# define __mutex_init(mutex, name, key) \
7405 +do { \
7406 + rt_mutex_init(&(mutex)->lock); \
7407 + __mutex_do_init((mutex), name, key); \
7408 +} while (0)
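
On RT every struct mutex wraps an rt_mutex, so mutex_lock() gains priority inheritance while the API stays identical, as the macro layer above shows; existing users recompile unchanged. A minimal sketch:

    static DEFINE_MUTEX(cfg_lock);        /* rt_mutex-backed under RT_FULL */

    static int cfg_update(int val, int *cfg)
    {
            int ret = mutex_lock_interruptible(&cfg_lock);

            if (ret)
                    return ret;
            *cfg = val;                   /* holder can be boosted if a
                                             higher-prio task blocks here */
            mutex_unlock(&cfg_lock);
            return 0;
    }
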
7411 diff -Nur linux-4.9.6.orig/include/linux/netdevice.h linux-4.9.6/include/linux/netdevice.h
7412 --- linux-4.9.6.orig/include/linux/netdevice.h 2017-01-26 08:25:24.000000000 +0100
7413 +++ linux-4.9.6/include/linux/netdevice.h 2017-01-28 13:59:09.939656496 +0100
7414 @@ -396,7 +396,19 @@
7415 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7417 void __napi_schedule(struct napi_struct *n);
7420 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7421 + * run as threads, and they can also be preempted (without PREEMPT_RT
7422 + * interrupt threads can not be preempted). Which means that calling
7423 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
7424 + * and can corrupt the napi->poll_list.
7426 +#ifdef CONFIG_PREEMPT_RT_FULL
7427 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7428 +#else
7429 void __napi_schedule_irqoff(struct napi_struct *n);
7430 +#endif
7432 static inline bool napi_disable_pending(struct napi_struct *n)
7434 @@ -2461,14 +2473,53 @@
7435 void synchronize_net(void);
7436 int init_dummy_netdev(struct net_device *dev);
7438 -DECLARE_PER_CPU(int, xmit_recursion);
7439 #define XMIT_RECURSION_LIMIT 10
7440 +#ifdef CONFIG_PREEMPT_RT_FULL
7441 +static inline int dev_recursion_level(void)
7443 + return current->xmit_recursion;
7446 +static inline int xmit_rec_read(void)
7448 + return current->xmit_recursion;
7451 +static inline void xmit_rec_inc(void)
7453 + current->xmit_recursion++;
7456 +static inline void xmit_rec_dec(void)
7458 + current->xmit_recursion--;
7463 +DECLARE_PER_CPU(int, xmit_recursion);
7465 static inline int dev_recursion_level(void)
7467 return this_cpu_read(xmit_recursion);
7470 +static inline int xmit_rec_read(void)
7472 + return __this_cpu_read(xmit_recursion);
7475 +static inline void xmit_rec_inc(void)
7477 + __this_cpu_inc(xmit_recursion);
7480 +static inline void xmit_rec_dec(void)
7482 + __this_cpu_dec(xmit_recursion);
7486 struct net_device *dev_get_by_index(struct net *net, int ifindex);
7487 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7488 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7489 @@ -2851,6 +2902,7 @@
7490 unsigned int dropped;
7491 struct sk_buff_head input_pkt_queue;
7492 struct napi_struct backlog;
7493 + struct sk_buff_head tofree_queue;
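
The xmit_recursion counter above moves from per-CPU storage into task_struct because an RT transmit path can be preempted and migrated mid-send, which would corrupt a per-CPU count; the accessors hide the difference. A sketch of the guard they support, loosely mirroring the net/core/dev.c callers converted elsewhere in this patch (function name and error value are illustrative):

    static int xmit_guarded(struct sk_buff *skb, struct net_device *dev)
    {
            int ret;

            if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
                    return -ELOOP;        /* refuse to recurse any deeper */

            xmit_rec_inc();
            ret = dev_queue_xmit(skb);    /* may lawfully re-enter once */
            xmit_rec_dec();
            return ret;
    }
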
7497 diff -Nur linux-4.9.6.orig/include/linux/netfilter/x_tables.h linux-4.9.6/include/linux/netfilter/x_tables.h
7498 --- linux-4.9.6.orig/include/linux/netfilter/x_tables.h 2017-01-26 08:25:24.000000000 +0100
7499 +++ linux-4.9.6/include/linux/netfilter/x_tables.h 2017-01-28 13:59:09.939656496 +0100
7502 #include <linux/netdevice.h>
7503 #include <linux/static_key.h>
7504 +#include <linux/locallock.h>
7505 #include <uapi/linux/netfilter/x_tables.h>
7507 /* Test a struct->invflags and a boolean for inequality */
7510 DECLARE_PER_CPU(seqcount_t, xt_recseq);
7512 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7514 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7516 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7519 unsigned int addend;
7521 + /* RT protection */
7522 + local_lock(xt_write_lock);
7525 * Low order bit of sequence is set if we already
7526 * called xt_write_recseq_begin().
7528 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7530 __this_cpu_add(xt_recseq.sequence, addend);
7531 + local_unlock(xt_write_lock);
7535 diff -Nur linux-4.9.6.orig/include/linux/nfs_fs.h linux-4.9.6/include/linux/nfs_fs.h
7536 --- linux-4.9.6.orig/include/linux/nfs_fs.h 2017-01-26 08:25:24.000000000 +0100
7537 +++ linux-4.9.6/include/linux/nfs_fs.h 2017-01-28 13:59:09.939656496 +0100
7538 @@ -165,7 +165,11 @@
7540 /* Readers: in-flight sillydelete RPC calls */
7541 /* Writers: rmdir */
7542 +#ifdef CONFIG_PREEMPT_RT_BASE
7543 + struct semaphore rmdir_sem;
7544 +#else
7545 struct rw_semaphore rmdir_sem;
7546 +#endif
7548 #if IS_ENABLED(CONFIG_NFS_V4)
7549 struct nfs4_cached_acl *nfs4_acl;
7550 diff -Nur linux-4.9.6.orig/include/linux/nfs_xdr.h linux-4.9.6/include/linux/nfs_xdr.h
7551 --- linux-4.9.6.orig/include/linux/nfs_xdr.h 2017-01-26 08:25:24.000000000 +0100
7552 +++ linux-4.9.6/include/linux/nfs_xdr.h 2017-01-28 13:59:09.939656496 +0100
7553 @@ -1490,7 +1490,7 @@
7554 struct nfs_removeargs args;
7555 struct nfs_removeres res;
7556 struct dentry *dentry;
7557 - wait_queue_head_t wq;
7558 + struct swait_queue_head wq;
7559 struct rpc_cred *cred;
7560 struct nfs_fattr dir_attr;
7562 diff -Nur linux-4.9.6.orig/include/linux/notifier.h linux-4.9.6/include/linux/notifier.h
7563 --- linux-4.9.6.orig/include/linux/notifier.h 2017-01-26 08:25:24.000000000 +0100
7564 +++ linux-4.9.6/include/linux/notifier.h 2017-01-28 13:59:09.939656496 +0100
7567 * Alan Cox <Alan.Cox@linux.org>
7571 #ifndef _LINUX_NOTIFIER_H
7572 #define _LINUX_NOTIFIER_H
7573 #include <linux/errno.h>
7575 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
7576 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
7577 * SRCU notifier chains should be used when the chain will be called very
7578 - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
7579 - * chains are slightly more difficult to use because they require special
7580 - * runtime initialization.
7581 + * often but notifier_blocks will seldom be removed.
7584 struct notifier_block;
7586 (name)->head = NULL; \
7589 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
7590 +/* srcu_notifier_heads must be cleaned up dynamically */
7591 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7592 #define srcu_cleanup_notifier_head(name) \
7593 cleanup_srcu_struct(&(name)->srcu);
7594 @@ -103,7 +101,13 @@
7596 #define RAW_NOTIFIER_INIT(name) { \
7598 -/* srcu_notifier_heads cannot be initialized statically */
7600 +#define SRCU_NOTIFIER_INIT(name, pcpu) \
7601 + { \
7602 + .mutex = __MUTEX_INITIALIZER(name.mutex), \
7603 + .head = NULL, \
7604 + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
7605 + }
7607 #define ATOMIC_NOTIFIER_HEAD(name) \
7608 struct atomic_notifier_head name = \
7609 @@ -115,6 +119,18 @@
7610 struct raw_notifier_head name = \
7611 RAW_NOTIFIER_INIT(name)
7613 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
7614 + static DEFINE_PER_CPU(struct srcu_struct_array, \
7615 + name##_head_srcu_array); \
7616 + mod struct srcu_notifier_head name = \
7617 + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
7619 +#define SRCU_NOTIFIER_HEAD(name) \
7620 + _SRCU_NOTIFIER_HEAD(name, )
7622 +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
7623 + _SRCU_NOTIFIER_HEAD(name, static)
7627 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
7628 @@ -184,12 +200,12 @@
7631 * Declared notifiers so far. I can imagine quite a few more chains
7632 - * over time (eg laptop power reset chains, reboot chain (to clean
7633 + * over time (eg laptop power reset chains, reboot chain (to clean
7634 * device units up), device [un]mount chain, module load/unload chain,
7635 - * low memory chain, screenblank chain (for plug in modular screenblankers)
7636 + * low memory chain, screenblank chain (for plug in modular screenblankers)
7637 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
7641 /* CPU notifiers are defined in include/linux/cpu.h. */
7643 /* netdevice notifiers are defined in include/linux/netdevice.h */
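
The hunks above make SRCU notifier heads statically initializable via the new SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() macros, so srcu_init_notifier_head() is no longer mandatory. A minimal usage sketch (not part of the patch; my_chain, my_nb and my_handler are hypothetical names):

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);	/* per-CPU srcu array set up statically */

static int my_handler(struct notifier_block *nb, unsigned long action,
		      void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_handler,
};

static int __init my_setup(void)
{
	/* No srcu_init_notifier_head() needed for a statically defined head. */
	srcu_notifier_chain_register(&my_chain, &my_nb);
	srcu_notifier_call_chain(&my_chain, 0, NULL);
	return 0;
}
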
7644 diff -Nur linux-4.9.6.orig/include/linux/percpu.h linux-4.9.6/include/linux/percpu.h
7645 --- linux-4.9.6.orig/include/linux/percpu.h 2017-01-26 08:25:24.000000000 +0100
7646 +++ linux-4.9.6/include/linux/percpu.h 2017-01-28 13:59:09.939656496 +0100
7648 #define PERCPU_MODULE_RESERVE 0
7651 +#ifdef CONFIG_PREEMPT_RT_FULL
7653 +#define get_local_var(var) (*({ \
7654 + migrate_disable(); \
7655 + this_cpu_ptr(&var); }))
7657 +#define put_local_var(var) do { \
7658 +	(void)&(var); \
7659 +	migrate_enable(); \
7660 +} while (0)
7661 +
7662 +# define get_local_ptr(var) ({ \
7663 +	migrate_disable(); \
7664 +	this_cpu_ptr(var); })
7665 +
7666 +# define put_local_ptr(var) do { \
7667 +	(void)(var); \
7668 +	migrate_enable(); \
7669 +} while (0)
7670 +
7671 +#else
7672 +
7673 +#define get_local_var(var) get_cpu_var(var)
7674 +#define put_local_var(var) put_cpu_var(var)
7675 +#define get_local_ptr(var) get_cpu_ptr(var)
7676 +#define put_local_ptr(var) put_cpu_ptr(var)
7677 +
7678 +#endif
7680 /* minimum unit size, also is the maximum supported allocation size */
7681 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
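
get_local_var()/put_local_var() (and the _ptr variants) provide per-CPU access that only disables migration on PREEMPT_RT_FULL, so the section stays preemptible, and that fall back to get_cpu_var()/put_cpu_var() otherwise. A sketch (not part of the patch) with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);

static void my_counter_inc(void)
{
	/* RT: migrate_disable() + this_cpu access; !RT: preempt_disable() */
	get_local_var(my_counter)++;
	put_local_var(my_counter);	/* migrate_enable() / preempt_enable() */
}
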
7683 diff -Nur linux-4.9.6.orig/include/linux/percpu-rwsem.h linux-4.9.6/include/linux/percpu-rwsem.h
7684 --- linux-4.9.6.orig/include/linux/percpu-rwsem.h 2017-01-26 08:25:24.000000000 +0100
7685 +++ linux-4.9.6/include/linux/percpu-rwsem.h 2017-01-28 13:59:09.939656496 +0100
7687 #include <linux/atomic.h>
7688 #include <linux/rwsem.h>
7689 #include <linux/percpu.h>
7690 -#include <linux/wait.h>
7691 +#include <linux/swait.h>
7692 #include <linux/rcu_sync.h>
7693 #include <linux/lockdep.h>
7696 struct rcu_sync rss;
7697 unsigned int __percpu *read_count;
7698 struct rw_semaphore rw_sem;
7699 - wait_queue_head_t writer;
7700 + struct swait_queue_head writer;
7705 .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
7706 .read_count = &__percpu_rwsem_rc_##name, \
7707 .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
7708 - .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
7709 + .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
7712 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
7713 extern void __percpu_up_read(struct percpu_rw_semaphore *);
7715 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
7716 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7721 __this_cpu_inc(*sem->read_count);
7722 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
7723 __percpu_down_read(sem, false); /* Unconditional memory barrier */
7726 - * The barrier() prevents the compiler from
7727 + * The preempt_enable() prevents the compiler from
7728 * bleeding the critical section out.
7732 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7734 - percpu_down_read_preempt_disable(sem);
7742 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
7743 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7746 - * The barrier() prevents the compiler from
7747 - * bleeding the critical section out.
7750 + preempt_disable();
7752 * Same as in percpu_down_read().
7755 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
7758 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7760 - preempt_disable();
7761 - percpu_up_read_preempt_enable(sem);
7764 extern void percpu_down_write(struct percpu_rw_semaphore *);
7765 extern void percpu_up_write(struct percpu_rw_semaphore *);
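
With the _preempt_disable()/_preempt_enable() variants folded into the plain calls, the per-CPU reader fast path no longer runs with preemption disabled. Callers keep the existing API; a sketch with a hypothetical my_sem:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_sem);

static void my_reader(void)
{
	percpu_down_read(&my_sem);	/* fast path: per-CPU count, now preemptible */
	/* read-side critical section */
	percpu_up_read(&my_sem);
}
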
7767 diff -Nur linux-4.9.6.orig/include/linux/pid.h linux-4.9.6/include/linux/pid.h
7768 --- linux-4.9.6.orig/include/linux/pid.h 2017-01-26 08:25:24.000000000 +0100
7769 +++ linux-4.9.6/include/linux/pid.h 2017-01-28 13:59:09.939656496 +0100
7771 #define _LINUX_PID_H
7773 #include <linux/rcupdate.h>
7774 +#include <linux/atomic.h>
7778 diff -Nur linux-4.9.6.orig/include/linux/preempt.h linux-4.9.6/include/linux/preempt.h
7779 --- linux-4.9.6.orig/include/linux/preempt.h 2017-01-26 08:25:24.000000000 +0100
7780 +++ linux-4.9.6/include/linux/preempt.h 2017-01-28 13:59:09.939656496 +0100
7782 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
7783 #define NMI_OFFSET (1UL << NMI_SHIFT)
7785 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
7786 +#ifndef CONFIG_PREEMPT_RT_FULL
7787 +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
7788 +#else
7789 +# define SOFTIRQ_DISABLE_OFFSET (0)
7790 +#endif
7792 /* We use the MSB mostly because it's available */
7793 #define PREEMPT_NEED_RESCHED 0x80000000
7795 #include <asm/preempt.h>
7797 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
7798 -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
7799 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
7801 +#ifndef CONFIG_PREEMPT_RT_FULL
7802 +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
7803 +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
7804 +#else
7805 +# define softirq_count() (0UL)
7806 +extern int in_serving_softirq(void);
7807 +#endif
7810 * Are we doing bottom half or hardware interrupt processing?
7812 #define in_irq() (hardirq_count())
7813 #define in_softirq() (softirq_count())
7814 #define in_interrupt() (irq_count())
7815 -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
7818 * Are we in NMI context?
7821 * The preempt_count offset after spin_lock()
7823 +#if !defined(CONFIG_PREEMPT_RT_FULL)
7824 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
7825 +#else
7826 +#define PREEMPT_LOCK_OFFSET 0
7827 +#endif
7830 * The preempt_count offset needed for things like:
7831 @@ -140,6 +153,20 @@
7832 #define preempt_count_inc() preempt_count_add(1)
7833 #define preempt_count_dec() preempt_count_sub(1)
7835 +#ifdef CONFIG_PREEMPT_LAZY
7836 +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
7837 +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
7838 +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
7839 +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
7840 +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
7841 +#else
7842 +#define add_preempt_lazy_count(val) do { } while (0)
7843 +#define sub_preempt_lazy_count(val) do { } while (0)
7844 +#define inc_preempt_lazy_count() do { } while (0)
7845 +#define dec_preempt_lazy_count() do { } while (0)
7846 +#define preempt_lazy_count() (0)
7847 +#endif
7849 #ifdef CONFIG_PREEMPT_COUNT
7851 #define preempt_disable() \
7852 @@ -148,13 +175,25 @@
7856 +#define preempt_lazy_disable() \
7858 + inc_preempt_lazy_count(); \
7862 #define sched_preempt_enable_no_resched() \
7865 preempt_count_dec(); \
7868 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
7869 +#ifdef CONFIG_PREEMPT_RT_BASE
7870 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
7871 +# define preempt_check_resched_rt() preempt_check_resched()
7873 +# define preempt_enable_no_resched() preempt_enable()
7874 +# define preempt_check_resched_rt() barrier();
7877 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
7879 @@ -179,6 +218,13 @@
7880 __preempt_schedule(); \
7883 +#define preempt_lazy_enable() \
7885 + dec_preempt_lazy_count(); \
7887 + preempt_check_resched(); \
7890 #else /* !CONFIG_PREEMPT */
7891 #define preempt_enable() \
7894 #define preempt_disable_notrace() barrier()
7895 #define preempt_enable_no_resched_notrace() barrier()
7896 #define preempt_enable_notrace() barrier()
7897 +#define preempt_check_resched_rt() barrier()
7898 #define preemptible() 0
7900 #endif /* CONFIG_PREEMPT_COUNT */
7901 @@ -244,10 +291,31 @@
7903 #define preempt_fold_need_resched() \
7905 - if (tif_need_resched()) \
7906 + if (tif_need_resched_now()) \
7907 set_preempt_need_resched(); \
7910 +#ifdef CONFIG_PREEMPT_RT_FULL
7911 +# define preempt_disable_rt() preempt_disable()
7912 +# define preempt_enable_rt() preempt_enable()
7913 +# define preempt_disable_nort() barrier()
7914 +# define preempt_enable_nort() barrier()
7916 + extern void migrate_disable(void);
7917 + extern void migrate_enable(void);
7918 +# else /* CONFIG_SMP */
7919 +# define migrate_disable() barrier()
7920 +# define migrate_enable() barrier()
7921 +# endif /* CONFIG_SMP */
7923 +# define preempt_disable_rt() barrier()
7924 +# define preempt_enable_rt() barrier()
7925 +# define preempt_disable_nort() preempt_disable()
7926 +# define preempt_enable_nort() preempt_enable()
7927 +# define migrate_disable() preempt_disable()
7928 +# define migrate_enable() preempt_enable()
7931 #ifdef CONFIG_PREEMPT_NOTIFIERS
7933 struct preempt_notifier;
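
migrate_disable()/migrate_enable() are the central primitives added here: on RT with SMP they pin the task to its current CPU without disabling preemption, and on !RT they collapse to preempt_disable()/preempt_enable(). A sketch of the intended pattern (my_percpu_work is hypothetical):

#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_percpu_work(void)
{
	migrate_disable();	/* task may be preempted, but not migrated */
	/* smp_processor_id() is stable for the whole section */
	pr_debug("running on CPU %d\n", smp_processor_id());
	migrate_enable();
}
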
7934 diff -Nur linux-4.9.6.orig/include/linux/printk.h linux-4.9.6/include/linux/printk.h
7935 --- linux-4.9.6.orig/include/linux/printk.h 2017-01-26 08:25:24.000000000 +0100
7936 +++ linux-4.9.6/include/linux/printk.h 2017-01-28 13:59:09.939656496 +0100
7937 @@ -126,9 +126,11 @@
7938 #ifdef CONFIG_EARLY_PRINTK
7939 extern asmlinkage __printf(1, 2)
7940 void early_printk(const char *fmt, ...);
7941 +extern void printk_kill(void);
7943 static inline __printf(1, 2) __cold
7944 void early_printk(const char *s, ...) { }
7945 +static inline void printk_kill(void) { }
7948 #ifdef CONFIG_PRINTK_NMI
7949 diff -Nur linux-4.9.6.orig/include/linux/radix-tree.h linux-4.9.6/include/linux/radix-tree.h
7950 --- linux-4.9.6.orig/include/linux/radix-tree.h 2017-01-26 08:25:24.000000000 +0100
7951 +++ linux-4.9.6/include/linux/radix-tree.h 2017-01-28 13:59:09.939656496 +0100
7952 @@ -289,9 +289,19 @@
7953 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
7954 void ***results, unsigned long *indices,
7955 unsigned long first_index, unsigned int max_items);
7956 +#ifdef CONFIG_PREEMPT_RT_FULL
7957 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
7958 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
7959 +static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
7965 int radix_tree_preload(gfp_t gfp_mask);
7966 int radix_tree_maybe_preload(gfp_t gfp_mask);
7967 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
7969 void radix_tree_init(void);
7970 void *radix_tree_tag_set(struct radix_tree_root *root,
7971 unsigned long index, unsigned int tag);
7974 static inline void radix_tree_preload_end(void)
7975 {
7976 -	preempt_enable();
7977 +	preempt_enable_nort();
7978 }
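
The RT stubs exist because the preload scheme relies on disabled preemption, which RT's sleeping spinlocks cannot guarantee; the canonical preload-then-insert pattern still compiles and behaves correctly on both configurations. A sketch, with my_tree/my_tree_lock as hypothetical names:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(my_tree, GFP_KERNEL);
static DEFINE_SPINLOCK(my_tree_lock);

static int my_insert(unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* no-op returning 0 on RT */
	if (err)
		return err;

	spin_lock(&my_tree_lock);
	err = radix_tree_insert(&my_tree, index, item);
	spin_unlock(&my_tree_lock);

	radix_tree_preload_end();		/* preempt_enable_nort() */
	return err;
}
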
7981 diff -Nur linux-4.9.6.orig/include/linux/random.h linux-4.9.6/include/linux/random.h
7982 --- linux-4.9.6.orig/include/linux/random.h 2017-01-26 08:25:24.000000000 +0100
7983 +++ linux-4.9.6/include/linux/random.h 2017-01-28 13:59:09.943656649 +0100
7986 extern void add_input_randomness(unsigned int type, unsigned int code,
7987 unsigned int value) __latent_entropy;
7988 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
7989 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
7991 extern void get_random_bytes(void *buf, int nbytes);
7992 extern int add_random_ready_callback(struct random_ready_callback *rdy);
7993 diff -Nur linux-4.9.6.orig/include/linux/rbtree_augmented.h linux-4.9.6/include/linux/rbtree_augmented.h
7994 --- linux-4.9.6.orig/include/linux/rbtree_augmented.h 2017-01-26 08:25:24.000000000 +0100
7995 +++ linux-4.9.6/include/linux/rbtree_augmented.h 2017-01-28 13:59:09.943656649 +0100
7998 #include <linux/compiler.h>
7999 #include <linux/rbtree.h>
8000 +#include <linux/rcupdate.h>
8003 * Please note - only struct rb_augment_callbacks and the prototypes for
8004 diff -Nur linux-4.9.6.orig/include/linux/rbtree.h linux-4.9.6/include/linux/rbtree.h
8005 --- linux-4.9.6.orig/include/linux/rbtree.h 2017-01-26 08:25:24.000000000 +0100
8006 +++ linux-4.9.6/include/linux/rbtree.h 2017-01-28 13:59:09.943656649 +0100
8009 #include <linux/kernel.h>
8010 #include <linux/stddef.h>
8011 -#include <linux/rcupdate.h>
8012 +#include <linux/rcu_assign_pointer.h>
8015 unsigned long __rb_parent_color;
8016 diff -Nur linux-4.9.6.orig/include/linux/rcu_assign_pointer.h linux-4.9.6/include/linux/rcu_assign_pointer.h
8017 --- linux-4.9.6.orig/include/linux/rcu_assign_pointer.h 1970-01-01 01:00:00.000000000 +0100
8018 +++ linux-4.9.6/include/linux/rcu_assign_pointer.h 2017-01-28 13:59:09.943656649 +0100
8020 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8021 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8022 +#include <linux/compiler.h>
8023 +#include <asm/barrier.h>
8026 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8027 + * @v: The value to statically initialize with.
8029 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8032 + * rcu_assign_pointer() - assign to RCU-protected pointer
8033 + * @p: pointer to assign to
8034 + * @v: value to assign (publish)
8036 + * Assigns the specified value to the specified RCU-protected
8037 + * pointer, ensuring that any concurrent RCU readers will see
8038 + * any prior initialization.
8040 + * Inserts memory barriers on architectures that require them
8041 + * (which is most of them), and also prevents the compiler from
8042 + * reordering the code that initializes the structure after the pointer
8043 + * assignment. More importantly, this call documents which pointers
8044 + * will be dereferenced by RCU read-side code.
8046 + * In some special cases, you may use RCU_INIT_POINTER() instead
8047 + * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
8048 + * to the fact that it does not constrain either the CPU or the compiler.
8049 + * That said, using RCU_INIT_POINTER() when you should have used
8050 + * rcu_assign_pointer() is a very bad thing that results in
8051 + * impossible-to-diagnose memory corruption. So please be careful.
8052 + * See the RCU_INIT_POINTER() comment header for details.
8054 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8055 + * once, appearances notwithstanding. One of the "extra" evaluations
8056 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8057 + * neither of which actually execute the argument. As with most cpp
8058 + * macros, this execute-arguments-only-once property is important, so
8059 + * please be careful when making changes to rcu_assign_pointer() and the
8060 + * other macros that it invokes.
8062 +#define rcu_assign_pointer(p, v) \
8064 + uintptr_t _r_a_p__v = (uintptr_t)(v); \
8066 + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
8067 + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
8068 +	else \
8069 +		smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8070 +	_r_a_p__v; \
8071 +})
8072 +
8073 +#endif
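
The new header only relocates RCU_INITIALIZER()/rcu_assign_pointer() so rbtree.h can use them without pulling in all of rcupdate.h; the publish semantics are unchanged. A standard usage sketch (struct my_data is hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
	int val;
};

static struct my_data __rcu *my_global;

static int my_publish(int val)
{
	struct my_data *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	p->val = val;
	/* smp_store_release() orders the initialization before the pointer
	 * becomes visible to rcu_dereference() readers. */
	rcu_assign_pointer(my_global, p);
	return 0;
}
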
8074 diff -Nur linux-4.9.6.orig/include/linux/rcupdate.h linux-4.9.6/include/linux/rcupdate.h
8075 --- linux-4.9.6.orig/include/linux/rcupdate.h 2017-01-26 08:25:24.000000000 +0100
8076 +++ linux-4.9.6/include/linux/rcupdate.h 2017-01-28 13:59:09.943656649 +0100
8078 #include <linux/compiler.h>
8079 #include <linux/ktime.h>
8080 #include <linux/irqflags.h>
8081 +#include <linux/rcu_assign_pointer.h>
8083 #include <asm/barrier.h>
8087 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8089 +#ifdef CONFIG_PREEMPT_RT_FULL
8090 +#define call_rcu_bh call_rcu
8091 +#else
8092 /**
8093  * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8094 * @head: structure to be used for queueing the RCU updates.
8097 void call_rcu_bh(struct rcu_head *head,
8098 		rcu_callback_t func);
8099 +#endif
8102 * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8103 @@ -301,6 +306,11 @@
8104 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8106 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8107 +#ifndef CONFIG_PREEMPT_RT_FULL
8108 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
8109 +#else
8110 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8111 +#endif
8113 #else /* #ifdef CONFIG_PREEMPT_RCU */
8119 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
8121 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8123 /* Internal to kernel */
8124 @@ -505,7 +517,14 @@
8125 int debug_lockdep_rcu_enabled(void);
8127 int rcu_read_lock_held(void);
8128 +#ifdef CONFIG_PREEMPT_RT_FULL
8129 +static inline int rcu_read_lock_bh_held(void)
8130 +{
8131 +	return rcu_read_lock_held();
8132 +}
8133 +#else
8134 int rcu_read_lock_bh_held(void);
8135 +#endif
8138 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8139 @@ -626,54 +645,6 @@
8143 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8144 - * @v: The value to statically initialize with.
8146 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8149 - * rcu_assign_pointer() - assign to RCU-protected pointer
8150 - * @p: pointer to assign to
8151 - * @v: value to assign (publish)
8153 - * Assigns the specified value to the specified RCU-protected
8154 - * pointer, ensuring that any concurrent RCU readers will see
8155 - * any prior initialization.
8157 - * Inserts memory barriers on architectures that require them
8158 - * (which is most of them), and also prevents the compiler from
8159 - * reordering the code that initializes the structure after the pointer
8160 - * assignment. More importantly, this call documents which pointers
8161 - * will be dereferenced by RCU read-side code.
8163 - * In some special cases, you may use RCU_INIT_POINTER() instead
8164 - * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
8165 - * to the fact that it does not constrain either the CPU or the compiler.
8166 - * That said, using RCU_INIT_POINTER() when you should have used
8167 - * rcu_assign_pointer() is a very bad thing that results in
8168 - * impossible-to-diagnose memory corruption. So please be careful.
8169 - * See the RCU_INIT_POINTER() comment header for details.
8171 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8172 - * once, appearances notwithstanding. One of the "extra" evaluations
8173 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8174 - * neither of which actually execute the argument. As with most cpp
8175 - * macros, this execute-arguments-only-once property is important, so
8176 - * please be careful when making changes to rcu_assign_pointer() and the
8177 - * other macros that it invokes.
8179 -#define rcu_assign_pointer(p, v) \
8181 - uintptr_t _r_a_p__v = (uintptr_t)(v); \
8183 - if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
8184 - WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
8185 -	else \
8186 -		smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8187 -	_r_a_p__v; \
8188 -})
8189 -
8190 /**
8191 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8192 * @p: The pointer to read
8194 @@ -951,10 +922,14 @@
8195 static inline void rcu_read_lock_bh(void)
8198 +#ifdef CONFIG_PREEMPT_RT_FULL
8199 +	rcu_read_lock();
8200 +#else
8201 	__acquire(RCU_BH);
8202 	rcu_lock_acquire(&rcu_bh_lock_map);
8203 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
8204 			 "rcu_read_lock_bh() used illegally while idle");
8205 +#endif
8209 @@ -964,10 +939,14 @@
8211 static inline void rcu_read_unlock_bh(void)
8213 +#ifdef CONFIG_PREEMPT_RT_FULL
8214 +	rcu_read_unlock();
8215 +#else
8216 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
8217 			 "rcu_read_unlock_bh() used illegally while idle");
8218 	rcu_lock_release(&rcu_bh_lock_map);
8219 	__release(RCU_BH);
8220 	local_bh_enable();
8221 +#endif
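
On RT there is no separate bh RCU flavor, so the _bh read side is layered on plain RCU and existing callers need no change. A brief sketch:

#include <linux/rcupdate.h>

static void my_bh_reader(void)
{
	rcu_read_lock_bh();	/* nests rcu_read_lock() on PREEMPT_RT_FULL */
	/* ... rcu_dereference_bh()-protected accesses ... */
	rcu_read_unlock_bh();
}
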
8224 diff -Nur linux-4.9.6.orig/include/linux/rcutree.h linux-4.9.6/include/linux/rcutree.h
8225 --- linux-4.9.6.orig/include/linux/rcutree.h 2017-01-26 08:25:24.000000000 +0100
8226 +++ linux-4.9.6/include/linux/rcutree.h 2017-01-28 13:59:09.943656649 +0100
8228 rcu_note_context_switch();
8231 +#ifdef CONFIG_PREEMPT_RT_FULL
8232 +# define synchronize_rcu_bh synchronize_rcu
8233 +#else
8234 void synchronize_rcu_bh(void);
8235 +#endif
8236 void synchronize_sched_expedited(void);
8237 void synchronize_rcu_expedited(void);
8242 void rcu_barrier(void);
8243 +#ifdef CONFIG_PREEMPT_RT_FULL
8244 +# define rcu_barrier_bh rcu_barrier
8245 +#else
8246 void rcu_barrier_bh(void);
8247 +#endif
8248 void rcu_barrier_sched(void);
8249 unsigned long get_state_synchronize_rcu(void);
8250 void cond_synchronize_rcu(unsigned long oldstate);
8252 extern unsigned long rcutorture_testseq;
8253 extern unsigned long rcutorture_vernum;
8254 unsigned long rcu_batches_started(void);
8255 -unsigned long rcu_batches_started_bh(void);
8256 unsigned long rcu_batches_started_sched(void);
8257 unsigned long rcu_batches_completed(void);
8258 -unsigned long rcu_batches_completed_bh(void);
8259 unsigned long rcu_batches_completed_sched(void);
8260 unsigned long rcu_exp_batches_completed(void);
8261 unsigned long rcu_exp_batches_completed_sched(void);
8262 void show_rcu_gp_kthreads(void);
8264 void rcu_force_quiescent_state(void);
8265 -void rcu_bh_force_quiescent_state(void);
8266 void rcu_sched_force_quiescent_state(void);
8268 void rcu_idle_enter(void);
8269 @@ -109,6 +114,16 @@
8271 bool rcu_is_watching(void);
8273 +#ifndef CONFIG_PREEMPT_RT_FULL
8274 +void rcu_bh_force_quiescent_state(void);
8275 +unsigned long rcu_batches_started_bh(void);
8276 +unsigned long rcu_batches_completed_bh(void);
8277 +#else
8278 +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
8279 +# define rcu_batches_completed_bh rcu_batches_completed
8280 +# define rcu_batches_started_bh rcu_batches_completed
8281 +#endif
8283 void rcu_all_qs(void);
8285 /* RCUtree hotplug events */
8286 diff -Nur linux-4.9.6.orig/include/linux/rtmutex.h linux-4.9.6/include/linux/rtmutex.h
8287 --- linux-4.9.6.orig/include/linux/rtmutex.h 2017-01-26 08:25:24.000000000 +0100
8288 +++ linux-4.9.6/include/linux/rtmutex.h 2017-01-28 13:59:09.943656649 +0100
8290 #define __LINUX_RT_MUTEX_H
8292 #include <linux/linkage.h>
8293 +#include <linux/spinlock_types_raw.h>
8294 #include <linux/rbtree.h>
8295 -#include <linux/spinlock_types.h>
8297 extern int max_lock_depth; /* for sysctl */
8299 +#ifdef CONFIG_DEBUG_MUTEXES
8300 +#include <linux/debug_locks.h>
8304 * The rt_mutex structure
8307 struct rb_root waiters;
8308 struct rb_node *waiters_leftmost;
8309 struct task_struct *owner;
8310 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8312 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8313 const char *name, *file;
8317 # define rt_mutex_debug_check_no_locks_held(task) do { } while (0)
8320 +# define rt_mutex_init(mutex) \
8322 + raw_spin_lock_init(&(mutex)->wait_lock); \
8323 + __rt_mutex_init(mutex, #mutex); \
8326 #ifdef CONFIG_DEBUG_RT_MUTEXES
8327 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8328 , .name = #mutexname, .file = __FILE__, .line = __LINE__
8329 -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__)
8330 extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8332 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8333 -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
8334 # define rt_mutex_debug_task_free(t) do { } while (0)
8337 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8338 - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8339 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8340 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8341 , .waiters = RB_ROOT \
8343 - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8344 + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8346 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8347 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8349 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8350 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8351 + , .save_state = 1 }
8353 #define DEFINE_RT_MUTEX(mutexname) \
8354 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8357 extern void rt_mutex_lock(struct rt_mutex *lock);
8358 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8359 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8360 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8361 struct hrtimer_sleeper *timeout);
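
Besides reworking the initializers, the patch adds a killable blocking variant. A sketch with a hypothetical rt-mutex (rt_mutex_unlock() is the pre-existing release path):

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_rtmutex);

static int my_locked_work(void)
{
	int ret;

	ret = rt_mutex_lock_killable(&my_rtmutex);	/* new in this patch */
	if (ret)
		return ret;	/* fatal signal arrived while blocking */
	/* critical section; waiters can priority-boost the owner */
	rt_mutex_unlock(&my_rtmutex);
	return 0;
}
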
8363 diff -Nur linux-4.9.6.orig/include/linux/rwlock_rt.h linux-4.9.6/include/linux/rwlock_rt.h
8364 --- linux-4.9.6.orig/include/linux/rwlock_rt.h 1970-01-01 01:00:00.000000000 +0100
8365 +++ linux-4.9.6/include/linux/rwlock_rt.h 2017-01-28 13:59:09.943656649 +0100
8367 +#ifndef __LINUX_RWLOCK_RT_H
8368 +#define __LINUX_RWLOCK_RT_H
8370 +#ifndef __LINUX_SPINLOCK_H
8371 +#error Do not include directly. Use spinlock.h
8374 +#define rwlock_init(rwl) \
8376 + static struct lock_class_key __key; \
8378 + rt_mutex_init(&(rwl)->lock); \
8379 + __rt_rwlock_init(rwl, #rwl, &__key); \
8382 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8383 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8384 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8385 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8386 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8387 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8388 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8389 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8390 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8391 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8393 +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
8394 +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
8396 +#define write_trylock_irqsave(lock, flags) \
8397 + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8399 +#define read_lock_irqsave(lock, flags) \
8401 + typecheck(unsigned long, flags); \
8402 + flags = rt_read_lock_irqsave(lock); \
8405 +#define write_lock_irqsave(lock, flags) \
8407 + typecheck(unsigned long, flags); \
8408 + flags = rt_write_lock_irqsave(lock); \
8411 +#define read_lock(lock) rt_read_lock(lock)
8413 +#define read_lock_bh(lock) \
8415 + local_bh_disable(); \
8416 + rt_read_lock(lock); \
8419 +#define read_lock_irq(lock) read_lock(lock)
8421 +#define write_lock(lock) rt_write_lock(lock)
8423 +#define write_lock_bh(lock) \
8425 + local_bh_disable(); \
8426 + rt_write_lock(lock); \
8429 +#define write_lock_irq(lock) write_lock(lock)
8431 +#define read_unlock(lock) rt_read_unlock(lock)
8433 +#define read_unlock_bh(lock) \
8435 + rt_read_unlock(lock); \
8436 + local_bh_enable(); \
8439 +#define read_unlock_irq(lock) read_unlock(lock)
8441 +#define write_unlock(lock) rt_write_unlock(lock)
8443 +#define write_unlock_bh(lock) \
8445 + rt_write_unlock(lock); \
8446 + local_bh_enable(); \
8449 +#define write_unlock_irq(lock) write_unlock(lock)
8451 +#define read_unlock_irqrestore(lock, flags) \
8453 + typecheck(unsigned long, flags); \
8455 + rt_read_unlock(lock); \
8458 +#define write_unlock_irqrestore(lock, flags) \
8460 + typecheck(unsigned long, flags); \
8462 +		rt_write_unlock(lock); \
8463 +	} while (0)
8464 +
8465 +#endif
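
The rwlock API is preserved verbatim; only the substance changes, since every rwlock_t now wraps an rt_mutex and read-side recursion is limited to the owning task. Callers look exactly as before (my_rwlock is hypothetical):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);

static void my_rw_reader(void)
{
	read_lock(&my_rwlock);	/* rt_read_lock(): sleeping and PI-aware on RT */
	/* ... */
	read_unlock(&my_rwlock);
}
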
8466 diff -Nur linux-4.9.6.orig/include/linux/rwlock_types.h linux-4.9.6/include/linux/rwlock_types.h
8467 --- linux-4.9.6.orig/include/linux/rwlock_types.h 2017-01-26 08:25:24.000000000 +0100
8468 +++ linux-4.9.6/include/linux/rwlock_types.h 2017-01-28 13:59:09.943656649 +0100
8470 #ifndef __LINUX_RWLOCK_TYPES_H
8471 #define __LINUX_RWLOCK_TYPES_H
8473 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8474 +# error "Do not include directly, include spinlock_types.h"
8478 * include/linux/rwlock_types.h - generic rwlock type definitions
8480 diff -Nur linux-4.9.6.orig/include/linux/rwlock_types_rt.h linux-4.9.6/include/linux/rwlock_types_rt.h
8481 --- linux-4.9.6.orig/include/linux/rwlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
8482 +++ linux-4.9.6/include/linux/rwlock_types_rt.h 2017-01-28 13:59:09.943656649 +0100
8484 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8485 +#define __LINUX_RWLOCK_TYPES_RT_H
8487 +#ifndef __LINUX_SPINLOCK_TYPES_H
8488 +#error "Do not include directly. Include spinlock_types.h instead"
8492 + * rwlocks - rtmutex which allows single reader recursion
8495 + struct rt_mutex lock;
8497 + unsigned int break_lock;
8498 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8499 + struct lockdep_map dep_map;
8503 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8504 +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
8506 +# define RW_DEP_MAP_INIT(lockname)
8509 +#define __RW_LOCK_UNLOCKED(name) \
8510 + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8511 + RW_DEP_MAP_INIT(name) }
8513 +#define DEFINE_RWLOCK(name) \
8514 + rwlock_t name = __RW_LOCK_UNLOCKED(name)
8517 diff -Nur linux-4.9.6.orig/include/linux/rwsem.h linux-4.9.6/include/linux/rwsem.h
8518 --- linux-4.9.6.orig/include/linux/rwsem.h 2017-01-26 08:25:24.000000000 +0100
8519 +++ linux-4.9.6/include/linux/rwsem.h 2017-01-28 13:59:09.943656649 +0100
8521 #include <linux/osq_lock.h>
8524 +#ifdef CONFIG_PREEMPT_RT_FULL
8525 +#include <linux/rwsem_rt.h>
8526 +#else /* PREEMPT_RT_FULL */
8528 struct rw_semaphore;
8530 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
8532 # define up_read_non_owner(sem) up_read(sem)
8535 +#endif /* !PREEMPT_RT_FULL */
8537 #endif /* _LINUX_RWSEM_H */
8538 diff -Nur linux-4.9.6.orig/include/linux/rwsem_rt.h linux-4.9.6/include/linux/rwsem_rt.h
8539 --- linux-4.9.6.orig/include/linux/rwsem_rt.h 1970-01-01 01:00:00.000000000 +0100
8540 +++ linux-4.9.6/include/linux/rwsem_rt.h 2017-01-28 13:59:09.943656649 +0100
8542 +#ifndef _LINUX_RWSEM_RT_H
8543 +#define _LINUX_RWSEM_RT_H
8545 +#ifndef _LINUX_RWSEM_H
8546 +#error "Include rwsem.h"
8550 + * RW-semaphores are a spinlock plus a reader-depth count.
8552 + * Note that the semantics are different from the usual
8553 + * Linux rw-sems: in PREEMPT_RT mode we do not allow
8554 + * multiple readers to hold the lock at once; we only allow
8555 + * a read-lock owner to read-lock recursively. This is
8556 + * better for latency, makes the implementation inherently
8557 + * fair and makes it simpler as well.
8560 +#include <linux/rtmutex.h>
8562 +struct rw_semaphore {
8563 + struct rt_mutex lock;
8565 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8566 + struct lockdep_map dep_map;
8570 +#define __RWSEM_INITIALIZER(name) \
8571 + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
8572 + RW_DEP_MAP_INIT(name) }
8574 +#define DECLARE_RWSEM(lockname) \
8575 + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
8577 +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
8578 + struct lock_class_key *key);
8580 +#define __rt_init_rwsem(sem, name, key) \
8582 + rt_mutex_init(&(sem)->lock); \
8583 + __rt_rwsem_init((sem), (name), (key));\
8586 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
8588 +# define rt_init_rwsem(sem) \
8590 + static struct lock_class_key __key; \
8592 + __rt_init_rwsem((sem), #sem, &__key); \
8595 +extern void rt_down_write(struct rw_semaphore *rwsem);
8596 +extern int rt_down_write_killable(struct rw_semaphore *rwsem);
8597 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
8598 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
8599 +extern int rt_down_write_killable_nested(struct rw_semaphore *rwsem,
8601 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
8602 + struct lockdep_map *nest);
8603 +extern void rt__down_read(struct rw_semaphore *rwsem);
8604 +extern void rt_down_read(struct rw_semaphore *rwsem);
8605 +extern int rt_down_write_trylock(struct rw_semaphore *rwsem);
8606 +extern int rt__down_read_trylock(struct rw_semaphore *rwsem);
8607 +extern int rt_down_read_trylock(struct rw_semaphore *rwsem);
8608 +extern void __rt_up_read(struct rw_semaphore *rwsem);
8609 +extern void rt_up_read(struct rw_semaphore *rwsem);
8610 +extern void rt_up_write(struct rw_semaphore *rwsem);
8611 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
8613 +#define init_rwsem(sem) rt_init_rwsem(sem)
8614 +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
8616 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
8618 + /* rt_mutex_has_waiters() */
8619 + return !RB_EMPTY_ROOT(&sem->lock.waiters);
8622 +static inline void __down_read(struct rw_semaphore *sem)
8624 + rt__down_read(sem);
8627 +static inline void down_read(struct rw_semaphore *sem)
8629 + rt_down_read(sem);
8632 +static inline int __down_read_trylock(struct rw_semaphore *sem)
8634 + return rt__down_read_trylock(sem);
8637 +static inline int down_read_trylock(struct rw_semaphore *sem)
8639 + return rt_down_read_trylock(sem);
8642 +static inline void down_write(struct rw_semaphore *sem)
8644 + rt_down_write(sem);
8647 +static inline int down_write_killable(struct rw_semaphore *sem)
8649 + return rt_down_write_killable(sem);
8652 +static inline int down_write_trylock(struct rw_semaphore *sem)
8654 + return rt_down_write_trylock(sem);
8657 +static inline void __up_read(struct rw_semaphore *sem)
8659 + __rt_up_read(sem);
8662 +static inline void up_read(struct rw_semaphore *sem)
8667 +static inline void up_write(struct rw_semaphore *sem)
8672 +static inline void downgrade_write(struct rw_semaphore *sem)
8674 + rt_downgrade_write(sem);
8677 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
8679 + return rt_down_read_nested(sem, subclass);
8682 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
8684 + rt_down_write_nested(sem, subclass);
8687 +static inline int down_write_killable_nested(struct rw_semaphore *sem,
8690 + return rt_down_write_killable_nested(sem, subclass);
8693 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8694 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8695 + struct rw_semaphore *nest_lock)
8697 + rt_down_write_nested_lock(sem, &nest_lock->dep_map);
8702 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8703 + struct rw_semaphore *nest_lock)
8705 +	rt_down_write_nested_lock(sem, NULL);
8706 +}
8707 +#endif
8708 +#endif
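
As the header comment explains, RT rwsems admit only a single reader task at a time (with owner recursion), so the familiar API keeps working but loses reader parallelism. Sketch:

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_rwsem);

static void my_rwsem_reader(void)
{
	down_read(&my_rwsem);	/* rt_down_read(); only the owner may recurse */
	/* read side; other readers block until up_read() */
	up_read(&my_rwsem);
}
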
8709 diff -Nur linux-4.9.6.orig/include/linux/sched.h linux-4.9.6/include/linux/sched.h
8710 --- linux-4.9.6.orig/include/linux/sched.h 2017-01-26 08:25:24.000000000 +0100
8711 +++ linux-4.9.6/include/linux/sched.h 2017-01-28 13:59:09.943656649 +0100
8713 #include <linux/nodemask.h>
8714 #include <linux/mm_types.h>
8715 #include <linux/preempt.h>
8716 +#include <asm/kmap_types.h>
8718 #include <asm/page.h>
8719 #include <asm/ptrace.h>
8720 @@ -243,10 +244,7 @@
8721 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
8722 __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
8724 -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
8725 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
8726 -#define task_is_stopped_or_traced(task) \
8727 - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
8728 #define task_contributes_to_load(task) \
8729 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
8730 (task->flags & PF_FROZEN) == 0 && \
8731 @@ -312,6 +310,11 @@
8735 +#define __set_current_state_no_track(state_value) \
8736 + do { current->state = (state_value); } while (0)
8737 +#define set_current_state_no_track(state_value) \
8738 + set_mb(current->state, (state_value))
8740 /* Task command name length */
8741 #define TASK_COMM_LEN 16
8743 @@ -1013,8 +1016,18 @@
8744 struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
8746 extern void wake_q_add(struct wake_q_head *head,
8747 - struct task_struct *task);
8748 -extern void wake_up_q(struct wake_q_head *head);
8749 + struct task_struct *task);
8750 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
8752 +static inline void wake_up_q(struct wake_q_head *head)
8754 + __wake_up_q(head, false);
8757 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
8759 + __wake_up_q(head, true);
8763 * sched-domains (multiprocessor balancing) declarations:
8764 @@ -1481,6 +1494,7 @@
8765 struct thread_info thread_info;
8767 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
8768 + volatile long saved_state; /* saved state for "spinlock sleepers" */
8771 unsigned int flags; /* per process flags, defined below */
8772 @@ -1520,6 +1534,12 @@
8775 unsigned int policy;
8776 +#ifdef CONFIG_PREEMPT_RT_FULL
8777 + int migrate_disable;
8778 +# ifdef CONFIG_SCHED_DEBUG
8779 + int migrate_disable_atomic;
8782 int nr_cpus_allowed;
8783 cpumask_t cpus_allowed;
8785 @@ -1654,6 +1674,9 @@
8787 struct task_cputime cputime_expires;
8788 struct list_head cpu_timers[3];
8789 +#ifdef CONFIG_PREEMPT_RT_BASE
8790 + struct task_struct *posix_timer_list;
8793 /* process credentials */
8794 const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
8795 @@ -1685,10 +1708,15 @@
8796 /* signal handlers */
8797 struct signal_struct *signal;
8798 struct sighand_struct *sighand;
8799 + struct sigqueue *sigqueue_cache;
8801 sigset_t blocked, real_blocked;
8802 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
8803 struct sigpending pending;
8804 +#ifdef CONFIG_PREEMPT_RT_FULL
8805 + /* TODO: move me into ->restart_block ? */
8806 + struct siginfo forced_info;
8809 unsigned long sas_ss_sp;
8811 @@ -1917,6 +1945,12 @@
8812 /* bitmask and counter of trace recursion */
8813 unsigned long trace_recursion;
8814 #endif /* CONFIG_TRACING */
8815 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
8816 + u64 preempt_timestamp_hist;
8817 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
8818 + long timer_offset;
8822 /* Coverage collection mode enabled for this task (0 if disabled). */
8823 enum kcov_mode kcov_mode;
8824 @@ -1942,9 +1976,23 @@
8825 unsigned int sequential_io;
8826 unsigned int sequential_io_avg;
8828 +#ifdef CONFIG_PREEMPT_RT_BASE
8829 + struct rcu_head put_rcu;
8830 + int softirq_nestcnt;
8831 + unsigned int softirqs_raised;
8833 +#ifdef CONFIG_PREEMPT_RT_FULL
8834 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
8836 + pte_t kmap_pte[KM_TYPE_NR];
8839 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8840 unsigned long task_state_change;
8842 +#ifdef CONFIG_PREEMPT_RT_FULL
8843 + int xmit_recursion;
8845 int pagefault_disabled;
8847 struct task_struct *oom_reaper_list;
8848 @@ -1984,14 +2032,6 @@
8852 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
8853 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
8855 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
8857 - return p->nr_cpus_allowed;
8860 #define TNF_MIGRATED 0x01
8861 #define TNF_NO_GROUP 0x02
8862 #define TNF_SHARED 0x04
8863 @@ -2207,6 +2247,15 @@
8864 extern void free_task(struct task_struct *tsk);
8865 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
8867 +#ifdef CONFIG_PREEMPT_RT_BASE
8868 +extern void __put_task_struct_cb(struct rcu_head *rhp);
8870 +static inline void put_task_struct(struct task_struct *t)
8872 + if (atomic_dec_and_test(&t->usage))
8873 + call_rcu(&t->put_rcu, __put_task_struct_cb);
8876 extern void __put_task_struct(struct task_struct *t);
8878 static inline void put_task_struct(struct task_struct *t)
8879 @@ -2214,6 +2263,7 @@
8880 if (atomic_dec_and_test(&t->usage))
8881 __put_task_struct(t);
8885 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
8886 struct task_struct *try_get_task_struct(struct task_struct **ptask);
8887 @@ -2255,6 +2305,7 @@
8891 +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
8892 #define PF_EXITING 0x00000004 /* getting shut down */
8893 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
8894 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
8895 @@ -2423,6 +2474,10 @@
8897 extern int set_cpus_allowed_ptr(struct task_struct *p,
8898 const struct cpumask *new_mask);
8899 +int migrate_me(void);
8900 +void tell_sched_cpu_down_begin(int cpu);
8901 +void tell_sched_cpu_down_done(int cpu);
8904 static inline void do_set_cpus_allowed(struct task_struct *p,
8905 const struct cpumask *new_mask)
8906 @@ -2435,6 +2490,9 @@
8910 +static inline int migrate_me(void) { return 0; }
8911 +static inline void tell_sched_cpu_down_begin(int cpu) { }
8912 +static inline void tell_sched_cpu_down_done(int cpu) { }
8915 #ifdef CONFIG_NO_HZ_COMMON
8916 @@ -2673,6 +2731,7 @@
8918 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
8919 extern int wake_up_process(struct task_struct *tsk);
8920 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
8921 extern void wake_up_new_task(struct task_struct *tsk);
8923 extern void kick_process(struct task_struct *tsk);
8924 @@ -2881,6 +2940,17 @@
8928 +#ifdef CONFIG_PREEMPT_RT_BASE
8929 +extern void __mmdrop_delayed(struct rcu_head *rhp);
8930 +static inline void mmdrop_delayed(struct mm_struct *mm)
8932 + if (atomic_dec_and_test(&mm->mm_count))
8933 + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
8936 +# define mmdrop_delayed(mm) mmdrop(mm)
8939 static inline void mmdrop_async_fn(struct work_struct *work)
8941 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
8942 @@ -3273,6 +3343,43 @@
8943 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
8946 +#ifdef CONFIG_PREEMPT_LAZY
8947 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
8949 + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
8952 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
8954 + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
8957 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
8959 + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
8962 +static inline int need_resched_lazy(void)
8964 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
8967 +static inline int need_resched_now(void)
8969 + return test_thread_flag(TIF_NEED_RESCHED);
8973 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
8974 +static inline int need_resched_lazy(void) { return 0; }
8976 +static inline int need_resched_now(void)
8978 + return test_thread_flag(TIF_NEED_RESCHED);
8983 static inline int restart_syscall(void)
8985 set_tsk_thread_flag(current, TIF_SIGPENDING);
8986 @@ -3304,6 +3411,51 @@
8987 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
8990 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
8992 + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
8994 +#ifdef CONFIG_PREEMPT_RT_FULL
8995 + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9001 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9003 + bool traced_stopped;
9005 +#ifdef CONFIG_PREEMPT_RT_FULL
9006 + unsigned long flags;
9008 + raw_spin_lock_irqsave(&task->pi_lock, flags);
9009 + traced_stopped = __task_is_stopped_or_traced(task);
9010 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9012 + traced_stopped = __task_is_stopped_or_traced(task);
9014 + return traced_stopped;
9017 +static inline bool task_is_traced(struct task_struct *task)
9019 + bool traced = false;
9021 + if (task->state & __TASK_TRACED)
9023 +#ifdef CONFIG_PREEMPT_RT_FULL
9024 + /* in case the task is sleeping on tasklist_lock */
9025 + raw_spin_lock_irq(&task->pi_lock);
9026 + if (task->state & __TASK_TRACED)
9028 + else if (task->saved_state & __TASK_TRACED)
9030 + raw_spin_unlock_irq(&task->pi_lock);
9036 * cond_resched() and cond_resched_lock(): latency reduction via
9037 * explicit rescheduling in places that are safe. The return
9038 @@ -3329,12 +3481,16 @@
9039 __cond_resched_lock(lock); \
9042 +#ifndef CONFIG_PREEMPT_RT_FULL
9043 extern int __cond_resched_softirq(void);
9045 #define cond_resched_softirq() ({ \
9046 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
9047 __cond_resched_softirq(); \
9050 +# define cond_resched_softirq() cond_resched()
9053 static inline void cond_resched_rcu(void)
9055 @@ -3509,6 +3665,31 @@
9057 #endif /* CONFIG_SMP */
9059 +static inline int __migrate_disabled(struct task_struct *p)
9061 +#ifdef CONFIG_PREEMPT_RT_FULL
9062 + return p->migrate_disable;
9068 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9069 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9071 + if (__migrate_disabled(p))
9072 + return cpumask_of(task_cpu(p));
9074 + return &p->cpus_allowed;
9077 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9079 + if (__migrate_disabled(p))
9081 + return p->nr_cpus_allowed;
9084 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9085 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
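
tsk_cpus_allowed()/tsk_nr_cpus_allowed() become inline functions so the scheduler sees an effective single-CPU affinity while a task is migrate-disabled. An illustrative, hypothetical helper:

#include <linux/cpumask.h>
#include <linux/sched.h>

static unsigned int my_effective_cpus(struct task_struct *p)
{
	/* 1 while p runs under migrate_disable() on RT, otherwise
	 * the width of its regular affinity mask */
	return cpumask_weight(tsk_cpus_allowed(p));
}
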
9087 diff -Nur linux-4.9.6.orig/include/linux/seqlock.h linux-4.9.6/include/linux/seqlock.h
9088 --- linux-4.9.6.orig/include/linux/seqlock.h 2017-01-26 08:25:24.000000000 +0100
9089 +++ linux-4.9.6/include/linux/seqlock.h 2017-01-28 13:59:09.943656649 +0100
9090 @@ -220,20 +220,30 @@
9091 return __read_seqcount_retry(s, start);
9096 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9097 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9103 -static inline void raw_write_seqcount_end(seqcount_t *s)
9104 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9106 + preempt_disable_rt();
9107 + __raw_write_seqcount_begin(s);
9110 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9116 +static inline void raw_write_seqcount_end(seqcount_t *s)
9118 + __raw_write_seqcount_end(s);
9119 + preempt_enable_rt();
9123 * raw_write_seqcount_barrier - do a seq write barrier
9124 * @s: pointer to seqcount_t
9125 @@ -428,10 +438,32 @@
9127 * Read side functions for starting and finalizing a read side section.
9129 +#ifndef CONFIG_PREEMPT_RT_FULL
9130 static inline unsigned read_seqbegin(const seqlock_t *sl)
9132 return read_seqcount_begin(&sl->seqcount);
9135 +/*
9136 + * Starvation safe read side for RT
9137 + */
9138 +static inline unsigned read_seqbegin(seqlock_t *sl)
9139 +{
9140 +	unsigned ret;
9141 +
9142 +repeat:
9143 +	ret = ACCESS_ONCE(sl->seqcount.sequence);
9144 +	if (unlikely(ret & 1)) {
9145 +		/*
9146 +		 * Take the lock and let the writer proceed (i.e. possibly
9147 +		 * boost it), otherwise we could loop here forever.
9148 +		 */
9149 +		spin_unlock_wait(&sl->lock);
9150 +		goto repeat;
9151 +	}
9152 +	return ret;
9153 +}
9154 +#endif
9156 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9158 @@ -446,36 +478,45 @@
9159 static inline void write_seqlock(seqlock_t *sl)
9161 spin_lock(&sl->lock);
9162 - write_seqcount_begin(&sl->seqcount);
9163 + __raw_write_seqcount_begin(&sl->seqcount);
9166 +static inline int try_write_seqlock(seqlock_t *sl)
9167 +{
9168 +	if (spin_trylock(&sl->lock)) {
9169 +		__raw_write_seqcount_begin(&sl->seqcount);
9170 +		return 1;
9171 +	}
9172 +	return 0;
9173 +}
9175 static inline void write_sequnlock(seqlock_t *sl)
9177 - write_seqcount_end(&sl->seqcount);
9178 + __raw_write_seqcount_end(&sl->seqcount);
9179 spin_unlock(&sl->lock);
9182 static inline void write_seqlock_bh(seqlock_t *sl)
9184 spin_lock_bh(&sl->lock);
9185 - write_seqcount_begin(&sl->seqcount);
9186 + __raw_write_seqcount_begin(&sl->seqcount);
9189 static inline void write_sequnlock_bh(seqlock_t *sl)
9191 - write_seqcount_end(&sl->seqcount);
9192 + __raw_write_seqcount_end(&sl->seqcount);
9193 spin_unlock_bh(&sl->lock);
9196 static inline void write_seqlock_irq(seqlock_t *sl)
9198 spin_lock_irq(&sl->lock);
9199 - write_seqcount_begin(&sl->seqcount);
9200 + __raw_write_seqcount_begin(&sl->seqcount);
9203 static inline void write_sequnlock_irq(seqlock_t *sl)
9205 - write_seqcount_end(&sl->seqcount);
9206 + __raw_write_seqcount_end(&sl->seqcount);
9207 spin_unlock_irq(&sl->lock);
9211 unsigned long flags;
9213 spin_lock_irqsave(&sl->lock, flags);
9214 - write_seqcount_begin(&sl->seqcount);
9215 + __raw_write_seqcount_begin(&sl->seqcount);
9221 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9223 - write_seqcount_end(&sl->seqcount);
9224 + __raw_write_seqcount_end(&sl->seqcount);
9225 spin_unlock_irqrestore(&sl->lock, flags);
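
The __raw_write_seqcount_begin()/end() split keeps seqlock writers preemptible on RT (the enclosing spinlock already serializes them), while the RT read side briefly waits on that lock instead of spinning on an odd sequence count. The usual pattern is unchanged (my_seqlock/my_value are hypothetical):

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_value;

static void my_write(u64 v)
{
	write_seqlock(&my_seqlock);	/* no preempt_disable_rt() inside */
	my_value = v;
	write_sequnlock(&my_seqlock);
}

static u64 my_read(void)
{
	unsigned seq;
	u64 v;

	do {
		seq = read_seqbegin(&my_seqlock);
		v = my_value;
	} while (read_seqretry(&my_seqlock, seq));
	return v;
}
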
9228 diff -Nur linux-4.9.6.orig/include/linux/signal.h linux-4.9.6/include/linux/signal.h
9229 --- linux-4.9.6.orig/include/linux/signal.h 2017-01-26 08:25:24.000000000 +0100
9230 +++ linux-4.9.6/include/linux/signal.h 2017-01-28 13:59:09.947656802 +0100
9234 extern void flush_sigqueue(struct sigpending *queue);
9235 +extern void flush_task_sigqueue(struct task_struct *tsk);
9237 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9238 static inline int valid_signal(unsigned long sig)
9239 diff -Nur linux-4.9.6.orig/include/linux/skbuff.h linux-4.9.6/include/linux/skbuff.h
9240 --- linux-4.9.6.orig/include/linux/skbuff.h 2017-01-26 08:25:24.000000000 +0100
9241 +++ linux-4.9.6/include/linux/skbuff.h 2017-01-28 13:59:09.947656802 +0100
9246 + raw_spinlock_t raw_lock;
9250 @@ -1573,6 +1574,12 @@
9251 __skb_queue_head_init(list);
9254 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9256 + raw_spin_lock_init(&list->raw_lock);
9257 + __skb_queue_head_init(list);
9260 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9261 struct lock_class_key *class)
9263 diff -Nur linux-4.9.6.orig/include/linux/smp.h linux-4.9.6/include/linux/smp.h
9264 --- linux-4.9.6.orig/include/linux/smp.h 2017-01-26 08:25:24.000000000 +0100
9265 +++ linux-4.9.6/include/linux/smp.h 2017-01-28 13:59:09.947656802 +0100
9267 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
9268 #define put_cpu() preempt_enable()
9270 +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
9271 +#define put_cpu_light() migrate_enable()
9274 * Callback to arch code if there's nosmp or maxcpus=0 on the
9275 * boot command line:
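
get_cpu_light()/put_cpu_light() mirror get_cpu()/put_cpu() but only forbid migration, so the section stays preemptible on RT. Sketch:

#include <linux/smp.h>

static void my_cpu_bound_work(void)
{
	int cpu = get_cpu_light();	/* migrate_disable() + smp_processor_id() */

	/* CPU-local work that tolerates preemption but not migration */
	(void)cpu;
	put_cpu_light();		/* migrate_enable() */
}
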
9276 diff -Nur linux-4.9.6.orig/include/linux/spinlock_api_smp.h linux-4.9.6/include/linux/spinlock_api_smp.h
9277 --- linux-4.9.6.orig/include/linux/spinlock_api_smp.h 2017-01-26 08:25:24.000000000 +0100
9278 +++ linux-4.9.6/include/linux/spinlock_api_smp.h 2017-01-28 13:59:09.947656802 +0100
9283 -#include <linux/rwlock_api_smp.h>
9284 +#ifndef CONFIG_PREEMPT_RT_FULL
9285 +# include <linux/rwlock_api_smp.h>
9288 #endif /* __LINUX_SPINLOCK_API_SMP_H */
9289 diff -Nur linux-4.9.6.orig/include/linux/spinlock.h linux-4.9.6/include/linux/spinlock.h
9290 --- linux-4.9.6.orig/include/linux/spinlock.h 2017-01-26 08:25:24.000000000 +0100
9291 +++ linux-4.9.6/include/linux/spinlock.h 2017-01-28 13:59:09.947656802 +0100
9292 @@ -271,7 +271,11 @@
9293 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
9295 /* Include rwlock functions */
9296 -#include <linux/rwlock.h>
9297 +#ifdef CONFIG_PREEMPT_RT_FULL
9298 +# include <linux/rwlock_rt.h>
9300 +# include <linux/rwlock.h>
9304 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9305 @@ -282,6 +286,10 @@
9306 # include <linux/spinlock_api_up.h>
9309 +#ifdef CONFIG_PREEMPT_RT_FULL
9310 +# include <linux/spinlock_rt.h>
9311 +#else /* PREEMPT_RT_FULL */
9314 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9316 @@ -347,6 +355,12 @@
9317 raw_spin_unlock(&lock->rlock);
9320 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
9322 + raw_spin_unlock(&lock->rlock);
9326 static __always_inline void spin_unlock_bh(spinlock_t *lock)
9328 raw_spin_unlock_bh(&lock->rlock);
9330 #define atomic_dec_and_lock(atomic, lock) \
9331 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9333 +#endif /* !PREEMPT_RT_FULL */
9335 #endif /* __LINUX_SPINLOCK_H */
9336 diff -Nur linux-4.9.6.orig/include/linux/spinlock_rt.h linux-4.9.6/include/linux/spinlock_rt.h
9337 --- linux-4.9.6.orig/include/linux/spinlock_rt.h 1970-01-01 01:00:00.000000000 +0100
9338 +++ linux-4.9.6/include/linux/spinlock_rt.h 2017-01-28 13:59:09.947656802 +0100
9340 +#ifndef __LINUX_SPINLOCK_RT_H
9341 +#define __LINUX_SPINLOCK_RT_H
9343 +#ifndef __LINUX_SPINLOCK_H
9344 +#error Do not include directly. Use spinlock.h
9347 +#include <linux/bug.h>
9350 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9352 +#define spin_lock_init(slock) \
9354 + static struct lock_class_key __key; \
9356 + rt_mutex_init(&(slock)->lock); \
9357 + __rt_spin_lock_init(slock, #slock, &__key); \
9360 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9361 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9362 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9364 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9365 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9366 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9367 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9368 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
9369 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9370 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9371 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9372 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9373 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9376 + * lockdep-less calls, for derived types like rwlock:
9377 + * (for trylock they can use rt_mutex_trylock() directly).
9379 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9380 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9381 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9383 +#define spin_lock(lock) rt_spin_lock(lock)
9385 +#define spin_lock_bh(lock) \
9387 + local_bh_disable(); \
9388 + rt_spin_lock(lock); \
9391 +#define spin_lock_irq(lock) spin_lock(lock)
9393 +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
9395 +#define spin_trylock(lock) \
9396 +({ \
9397 +	int __locked; \
9398 +	__locked = spin_do_trylock(lock); \
9399 +	__locked; \
9400 +})
9402 +#ifdef CONFIG_LOCKDEP
9403 +# define spin_lock_nested(lock, subclass) \
9405 + rt_spin_lock_nested(lock, subclass); \
9408 +#define spin_lock_bh_nested(lock, subclass) \
9410 + local_bh_disable(); \
9411 + rt_spin_lock_nested(lock, subclass); \
9414 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9416 + typecheck(unsigned long, flags); \
9418 + rt_spin_lock_nested(lock, subclass); \
9421 +# define spin_lock_nested(lock, subclass) spin_lock(lock)
9422 +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
9424 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9426 + typecheck(unsigned long, flags); \
9428 + spin_lock(lock); \
9432 +#define spin_lock_irqsave(lock, flags) \
9433 +	do { \
9434 +		typecheck(unsigned long, flags); \
9435 +		flags = 0; \
9436 +		spin_lock(lock); \
9437 +	} while (0)
9439 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9441 + unsigned long flags = 0;
9442 +#ifdef CONFIG_TRACE_IRQFLAGS
9443 + flags = rt_spin_lock_trace_flags(lock);
9445 + spin_lock(lock); /* lock_local */
9450 +/* FIXME: we need rt_spin_lock_nest_lock */
9451 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9453 +#define spin_unlock(lock) rt_spin_unlock(lock)
9454 +#define spin_unlock_no_deboost(lock) rt_spin_unlock_no_deboost(lock)
9456 +#define spin_unlock_bh(lock) \
9458 + rt_spin_unlock(lock); \
9459 + local_bh_enable(); \
9462 +#define spin_unlock_irq(lock) spin_unlock(lock)
9464 +#define spin_unlock_irqrestore(lock, flags) \
9466 + typecheck(unsigned long, flags); \
9468 + spin_unlock(lock); \
9471 +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
9472 +#define spin_trylock_irq(lock) spin_trylock(lock)
9474 +#define spin_trylock_irqsave(lock, flags) \
9475 + rt_spin_trylock_irqsave(lock, &(flags))
9477 +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
9479 +#ifdef CONFIG_GENERIC_LOCKBREAK
9480 +# define spin_is_contended(lock) ((lock)->break_lock)
9482 +# define spin_is_contended(lock) (((void)(lock), 0))
9485 +static inline int spin_can_lock(spinlock_t *lock)
9487 + return !rt_mutex_is_locked(&lock->lock);
9490 +static inline int spin_is_locked(spinlock_t *lock)
9492 + return rt_mutex_is_locked(&lock->lock);
9495 +static inline void assert_spin_locked(spinlock_t *lock)
9497 + BUG_ON(!spin_is_locked(lock));
9500 +#define atomic_dec_and_lock(atomic, lock) \
9501 +	atomic_dec_and_spin_lock(atomic, lock)
9502 +
9503 +#endif
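
On RT a spinlock_t wraps an rt_mutex, so even the _irqsave forms neither disable interrupts nor preemption; flags is retained purely for API compatibility. Sketch (my_lock is hypothetical):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

static void my_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);	/* may sleep under contention on RT */
	/* critical section, preemptible on RT */
	spin_unlock_irqrestore(&my_lock, flags);
}
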
9504 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types.h linux-4.9.6/include/linux/spinlock_types.h
9505 --- linux-4.9.6.orig/include/linux/spinlock_types.h 2017-01-26 08:25:24.000000000 +0100
9506 +++ linux-4.9.6/include/linux/spinlock_types.h 2017-01-28 13:59:09.947656802 +0100
9508 * Released under the General Public License (GPL).
9511 -#if defined(CONFIG_SMP)
9512 -# include <asm/spinlock_types.h>
9514 -# include <linux/spinlock_types_up.h>
9517 -#include <linux/lockdep.h>
9519 -typedef struct raw_spinlock {
9520 - arch_spinlock_t raw_lock;
9521 -#ifdef CONFIG_GENERIC_LOCKBREAK
9522 - unsigned int break_lock;
9524 -#ifdef CONFIG_DEBUG_SPINLOCK
9525 - unsigned int magic, owner_cpu;
9528 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9529 - struct lockdep_map dep_map;
9533 -#define SPINLOCK_MAGIC 0xdead4ead
9535 -#define SPINLOCK_OWNER_INIT ((void *)-1L)
9537 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9538 -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
9540 -# define SPIN_DEP_MAP_INIT(lockname)
9542 +#include <linux/spinlock_types_raw.h>
9544 -#ifdef CONFIG_DEBUG_SPINLOCK
9545 -# define SPIN_DEBUG_INIT(lockname) \
9546 - .magic = SPINLOCK_MAGIC, \
9547 - .owner_cpu = -1, \
9548 - .owner = SPINLOCK_OWNER_INIT,
9549 +#ifndef CONFIG_PREEMPT_RT_FULL
9550 +# include <linux/spinlock_types_nort.h>
9551 +# include <linux/rwlock_types.h>
9553 -# define SPIN_DEBUG_INIT(lockname)
9554 +# include <linux/rtmutex.h>
9555 +# include <linux/spinlock_types_rt.h>
9556 +# include <linux/rwlock_types_rt.h>
9559 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
9561 - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
9562 - SPIN_DEBUG_INIT(lockname) \
9563 - SPIN_DEP_MAP_INIT(lockname) }
9565 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
9566 - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9568 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9570 -typedef struct spinlock {
9572 - struct raw_spinlock rlock;
9574 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9575 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9577 - u8 __padding[LOCK_PADSIZE];
9578 - struct lockdep_map dep_map;
9584 -#define __SPIN_LOCK_INITIALIZER(lockname) \
9585 - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9587 -#define __SPIN_LOCK_UNLOCKED(lockname) \
9588 - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9590 -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9592 -#include <linux/rwlock_types.h>
9594 #endif /* __LINUX_SPINLOCK_TYPES_H */
9595 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_nort.h linux-4.9.6/include/linux/spinlock_types_nort.h
9596 --- linux-4.9.6.orig/include/linux/spinlock_types_nort.h 1970-01-01 01:00:00.000000000 +0100
9597 +++ linux-4.9.6/include/linux/spinlock_types_nort.h 2017-01-28 13:59:09.947656802 +0100
9599 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
9600 +#define __LINUX_SPINLOCK_TYPES_NORT_H
9602 +#ifndef __LINUX_SPINLOCK_TYPES_H
9603 +#error "Do not include directly. Include spinlock_types.h instead"
9607 + * The non-RT version maps spinlocks to raw_spinlocks
9609 +typedef struct spinlock {
9611 + struct raw_spinlock rlock;
9613 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9614 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9616 + u8 __padding[LOCK_PADSIZE];
9617 + struct lockdep_map dep_map;
9623 +#define __SPIN_LOCK_INITIALIZER(lockname) \
9624 + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9626 +#define __SPIN_LOCK_UNLOCKED(lockname) \
9627 + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9629 +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9632 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_raw.h linux-4.9.6/include/linux/spinlock_types_raw.h
9633 --- linux-4.9.6.orig/include/linux/spinlock_types_raw.h 1970-01-01 01:00:00.000000000 +0100
9634 +++ linux-4.9.6/include/linux/spinlock_types_raw.h 2017-01-28 13:59:09.947656802 +0100
9636 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
9637 +#define __LINUX_SPINLOCK_TYPES_RAW_H
9639 +#if defined(CONFIG_SMP)
9640 +# include <asm/spinlock_types.h>
9642 +# include <linux/spinlock_types_up.h>
9645 +#include <linux/lockdep.h>
9647 +typedef struct raw_spinlock {
9648 + arch_spinlock_t raw_lock;
9649 +#ifdef CONFIG_GENERIC_LOCKBREAK
9650 + unsigned int break_lock;
9652 +#ifdef CONFIG_DEBUG_SPINLOCK
9653 + unsigned int magic, owner_cpu;
9656 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9657 + struct lockdep_map dep_map;
9661 +#define SPINLOCK_MAGIC 0xdead4ead
9663 +#define SPINLOCK_OWNER_INIT ((void *)-1L)
9665 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9666 +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
9668 +# define SPIN_DEP_MAP_INIT(lockname)
9671 +#ifdef CONFIG_DEBUG_SPINLOCK
9672 +# define SPIN_DEBUG_INIT(lockname) \
9673 + .magic = SPINLOCK_MAGIC, \
9674 + .owner_cpu = -1, \
9675 + .owner = SPINLOCK_OWNER_INIT,
9677 +# define SPIN_DEBUG_INIT(lockname)
9680 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
9682 + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
9683 + SPIN_DEBUG_INIT(lockname) \
9684 + SPIN_DEP_MAP_INIT(lockname) }
9686 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
9687 + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9689 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9692 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_rt.h linux-4.9.6/include/linux/spinlock_types_rt.h
9693 --- linux-4.9.6.orig/include/linux/spinlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
9694 +++ linux-4.9.6/include/linux/spinlock_types_rt.h 2017-01-28 13:59:09.947656802 +0100
9696 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
9697 +#define __LINUX_SPINLOCK_TYPES_RT_H
9699 +#ifndef __LINUX_SPINLOCK_TYPES_H
9700 +#error "Do not include directly. Include spinlock_types.h instead"
9703 +#include <linux/cache.h>
9706 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
9708 +typedef struct spinlock {
9709 + struct rt_mutex lock;
9710 + unsigned int break_lock;
9711 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9712 + struct lockdep_map dep_map;
9716 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9717 +# define __RT_SPIN_INITIALIZER(name) \
9719 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9720 + .save_state = 1, \
9721 + .file = __FILE__, \
9722 + .line = __LINE__, \
9725 +# define __RT_SPIN_INITIALIZER(name) \
9727 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9728 + .save_state = 1, \
9733 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
9736 +#define __SPIN_LOCK_UNLOCKED(name) \
9737 + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
9738 + SPIN_DEP_MAP_INIT(name) }
9740 +#define DEFINE_SPINLOCK(name) \
9741 + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
9744 diff -Nur linux-4.9.6.orig/include/linux/srcu.h linux-4.9.6/include/linux/srcu.h
9745 --- linux-4.9.6.orig/include/linux/srcu.h 2017-01-26 08:25:24.000000000 +0100
9746 +++ linux-4.9.6/include/linux/srcu.h 2017-01-28 13:59:09.947656802 +0100
9749 void process_srcu(struct work_struct *work);
9751 -#define __SRCU_STRUCT_INIT(name) \
9752 +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
9754 .completed = -300, \
9755 - .per_cpu_ref = &name##_srcu_array, \
9756 + .per_cpu_ref = &pcpu_name, \
9757 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
9759 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
9762 #define __DEFINE_SRCU(name, is_static) \
9763 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
9764 - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
9765 + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
9766 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
9767 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
9769 diff -Nur linux-4.9.6.orig/include/linux/suspend.h linux-4.9.6/include/linux/suspend.h
9770 --- linux-4.9.6.orig/include/linux/suspend.h 2017-01-26 08:25:24.000000000 +0100
9771 +++ linux-4.9.6/include/linux/suspend.h 2017-01-28 13:59:09.947656802 +0100
9772 @@ -193,6 +193,12 @@
9776 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
9777 +extern bool pm_in_action;
9779 +# define pm_in_action false
9782 #ifdef CONFIG_SUSPEND
9784 * suspend_set_ops - set platform dependent suspend operations
9785 diff -Nur linux-4.9.6.orig/include/linux/swait.h linux-4.9.6/include/linux/swait.h
9786 --- linux-4.9.6.orig/include/linux/swait.h 2017-01-26 08:25:24.000000000 +0100
9787 +++ linux-4.9.6/include/linux/swait.h 2017-01-28 13:59:09.947656802 +0100
9789 extern void swake_up(struct swait_queue_head *q);
9790 extern void swake_up_all(struct swait_queue_head *q);
9791 extern void swake_up_locked(struct swait_queue_head *q);
9792 +extern void swake_up_all_locked(struct swait_queue_head *q);
9794 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
9795 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
9796 diff -Nur linux-4.9.6.orig/include/linux/swap.h linux-4.9.6/include/linux/swap.h
9797 --- linux-4.9.6.orig/include/linux/swap.h 2017-01-26 08:25:24.000000000 +0100
9798 +++ linux-4.9.6/include/linux/swap.h 2017-01-28 13:59:09.947656802 +0100
9800 #include <linux/fs.h>
9801 #include <linux/atomic.h>
9802 #include <linux/page-flags.h>
9803 +#include <linux/locallock.h>
9804 #include <asm/page.h>
9806 struct notifier_block;
9808 void *workingset_eviction(struct address_space *mapping, struct page *page);
9809 bool workingset_refault(void *shadow);
9810 void workingset_activation(struct page *page);
9811 -extern struct list_lru workingset_shadow_nodes;
9812 +extern struct list_lru __workingset_shadow_nodes;
9813 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
9815 static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
9820 /* linux/mm/swap.c */
9821 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
9822 extern void lru_cache_add(struct page *);
9823 extern void lru_cache_add_anon(struct page *page);
9824 extern void lru_cache_add_file(struct page *page);
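The DECLARE_LOCAL_IRQ_LOCK() lines above rely on the locallock facility this patch adds in include/linux/locallock.h, which is not part of this excerpt. A hedged sketch of the consumer side as mm/swap.c is expected to use swapvec_lock; the helpers are assumed to degrade to local_irq_save()/local_irq_restore() on !RT:

#include <linux/locallock.h>

DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);	/* the real definition lives in mm/swap.c */

static void my_pagevec_op(struct page *page)
{
	unsigned long flags;

	local_lock_irqsave(swapvec_lock, flags);
	/* per-CPU pagevec manipulation: still serialized per CPU, but a
	 * sleeping lock on RT instead of a hard interrupts-off section */
	local_unlock_irqrestore(swapvec_lock, flags);
}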
9825 diff -Nur linux-4.9.6.orig/include/linux/swork.h linux-4.9.6/include/linux/swork.h
9826 --- linux-4.9.6.orig/include/linux/swork.h 1970-01-01 01:00:00.000000000 +0100
9827 +++ linux-4.9.6/include/linux/swork.h 2017-01-28 13:59:09.947656802 +0100
9829 +#ifndef _LINUX_SWORK_H
9830 +#define _LINUX_SWORK_H
9832 +#include <linux/list.h>
9834 +struct swork_event {
9835 + struct list_head item;
9836 + unsigned long flags;
9837 + void (*func)(struct swork_event *);
9840 +static inline void INIT_SWORK(struct swork_event *event,
9841 + void (*func)(struct swork_event *))
9844 + event->func = func;
9847 +bool swork_queue(struct swork_event *sev);
9849 +int swork_get(void);
9850 +void swork_put(void);
9852 +#endif /* _LINUX_SWORK_H */
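The header above is the entire client-visible surface of the simple-work facility; its kthread backend arrives elsewhere in this patch. A hedged usage sketch (my_event_fn and my_client_init are illustrative names):

#include <linux/swork.h>

static struct swork_event my_event;

static void my_event_fn(struct swork_event *sev)
{
	/* runs in the swork helper thread: preemptible, may sleep */
}

static int my_client_init(void)
{
	int err = swork_get();	/* ensure the helper thread exists */

	if (err)
		return err;
	INIT_SWORK(&my_event, my_event_fn);
	swork_queue(&my_event);	/* safe from contexts where queue_work()
				 * would be problematic on RT */
	return 0;		/* balance with swork_put() on teardown */
}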
9853 diff -Nur linux-4.9.6.orig/include/linux/thread_info.h linux-4.9.6/include/linux/thread_info.h
9854 --- linux-4.9.6.orig/include/linux/thread_info.h 2017-01-26 08:25:24.000000000 +0100
9855 +++ linux-4.9.6/include/linux/thread_info.h 2017-01-28 13:59:09.947656802 +0100
9856 @@ -107,7 +107,17 @@
9857 #define test_thread_flag(flag) \
9858 test_ti_thread_flag(current_thread_info(), flag)
9860 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
9861 +#ifdef CONFIG_PREEMPT_LAZY
9862 +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
9863 + test_thread_flag(TIF_NEED_RESCHED_LAZY))
9864 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
9865 +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
9868 +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
9869 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
9870 +#define tif_need_resched_lazy() 0
9873 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
9874 static inline int arch_within_stack_frames(const void * const stack,
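A hedged sketch of how the split accessors above are meant to be read: generic code keeps testing tif_need_resched(), which now folds in the lazy flag, while RT-aware paths can distinguish an immediate request from one that may be deferred:

/* illustrative only; the real consumers are the scheduler and the
 * architecture exit paths changed elsewhere in this patch */
static bool my_must_resched_now(void)
{
	if (!tif_need_resched())	/* neither flag set */
		return false;

	/* only a "now" request preempts immediately; a lazy request is
	 * honoured once the task leaves its preempt-lazy region */
	return tif_need_resched_now();
}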
9875 diff -Nur linux-4.9.6.orig/include/linux/timer.h linux-4.9.6/include/linux/timer.h
9876 --- linux-4.9.6.orig/include/linux/timer.h 2017-01-26 08:25:24.000000000 +0100
9877 +++ linux-4.9.6/include/linux/timer.h 2017-01-28 13:59:09.947656802 +0100
9880 extern int try_to_del_timer_sync(struct timer_list *timer);
9883 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
9884 extern int del_timer_sync(struct timer_list *timer);
9886 # define del_timer_sync(t) del_timer(t)
9887 diff -Nur linux-4.9.6.orig/include/linux/trace_events.h linux-4.9.6/include/linux/trace_events.h
9888 --- linux-4.9.6.orig/include/linux/trace_events.h 2017-01-26 08:25:24.000000000 +0100
9889 +++ linux-4.9.6/include/linux/trace_events.h 2017-01-28 13:59:09.947656802 +0100
9891 unsigned char flags;
9892 unsigned char preempt_count;
9894 + unsigned short migrate_disable;
9895 + unsigned short padding;
9896 + unsigned char preempt_lazy_count;
9899 #define TRACE_EVENT_TYPE_MAX \
9900 diff -Nur linux-4.9.6.orig/include/linux/uaccess.h linux-4.9.6/include/linux/uaccess.h
9901 --- linux-4.9.6.orig/include/linux/uaccess.h 2017-01-26 08:25:24.000000000 +0100
9902 +++ linux-4.9.6/include/linux/uaccess.h 2017-01-28 13:59:09.947656802 +0100
9905 static inline void pagefault_disable(void)
9907 + migrate_disable();
9908 pagefault_disabled_inc();
9910 * make sure to have issued the store before a pagefault
9914 pagefault_disabled_dec();
9919 diff -Nur linux-4.9.6.orig/include/linux/uprobes.h linux-4.9.6/include/linux/uprobes.h
9920 --- linux-4.9.6.orig/include/linux/uprobes.h 2017-01-26 08:25:24.000000000 +0100
9921 +++ linux-4.9.6/include/linux/uprobes.h 2017-01-28 13:59:09.947656802 +0100
9923 #include <linux/errno.h>
9924 #include <linux/rbtree.h>
9925 #include <linux/types.h>
9926 +#include <linux/wait.h>
9928 struct vm_area_struct;
9930 diff -Nur linux-4.9.6.orig/include/linux/vmstat.h linux-4.9.6/include/linux/vmstat.h
9931 --- linux-4.9.6.orig/include/linux/vmstat.h 2017-01-26 08:25:24.000000000 +0100
9932 +++ linux-4.9.6/include/linux/vmstat.h 2017-01-28 13:59:09.947656802 +0100
9935 static inline void __count_vm_event(enum vm_event_item item)
9937 + preempt_disable_rt();
9938 raw_cpu_inc(vm_event_states.event[item]);
9939 + preempt_enable_rt();
9942 static inline void count_vm_event(enum vm_event_item item)
9945 static inline void __count_vm_events(enum vm_event_item item, long delta)
9947 + preempt_disable_rt();
9948 raw_cpu_add(vm_event_states.event[item], delta);
9949 + preempt_enable_rt();
9952 static inline void count_vm_events(enum vm_event_item item, long delta)
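preempt_disable_rt()/preempt_enable_rt() come from this patch's include/linux/preempt.h changes, which are not part of this excerpt. Their presumed shape, for orientation: on !RT the callers already run with preemption off, so the guards cost nothing; on RT they pin the task so the raw_cpu_*() ops stay CPU-local:

/* presumed definitions, mirroring the usual RT-patch pattern */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()	preempt_disable()
# define preempt_enable_rt()	preempt_enable()
#else
# define preempt_disable_rt()	barrier()
# define preempt_enable_rt()	barrier()
#endif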
9953 diff -Nur linux-4.9.6.orig/include/linux/wait.h linux-4.9.6/include/linux/wait.h
9954 --- linux-4.9.6.orig/include/linux/wait.h 2017-01-26 08:25:24.000000000 +0100
9955 +++ linux-4.9.6/include/linux/wait.h 2017-01-28 13:59:09.947656802 +0100
9957 #include <linux/spinlock.h>
9958 #include <asm/current.h>
9959 #include <uapi/linux/wait.h>
9960 +#include <linux/atomic.h>
9962 typedef struct __wait_queue wait_queue_t;
9963 typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
9964 diff -Nur linux-4.9.6.orig/include/net/dst.h linux-4.9.6/include/net/dst.h
9965 --- linux-4.9.6.orig/include/net/dst.h 2017-01-26 08:25:24.000000000 +0100
9966 +++ linux-4.9.6/include/net/dst.h 2017-01-28 13:59:09.947656802 +0100
9968 static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
9969 struct sk_buff *skb)
9971 - const struct hh_cache *hh;
9972 + struct hh_cache *hh;
9974 if (dst->pending_confirm) {
9975 unsigned long now = jiffies;
9976 diff -Nur linux-4.9.6.orig/include/net/gen_stats.h linux-4.9.6/include/net/gen_stats.h
9977 --- linux-4.9.6.orig/include/net/gen_stats.h 2017-01-26 08:25:24.000000000 +0100
9978 +++ linux-4.9.6/include/net/gen_stats.h 2017-01-28 13:59:09.947656802 +0100
9980 #include <linux/socket.h>
9981 #include <linux/rtnetlink.h>
9982 #include <linux/pkt_sched.h>
9983 +#include <net/net_seq_lock.h>
9985 struct gnet_stats_basic_cpu {
9986 struct gnet_stats_basic_packed bstats;
9988 spinlock_t *lock, struct gnet_dump *d,
9991 -int gnet_stats_copy_basic(const seqcount_t *running,
9992 +int gnet_stats_copy_basic(net_seqlock_t *running,
9993 struct gnet_dump *d,
9994 struct gnet_stats_basic_cpu __percpu *cpu,
9995 struct gnet_stats_basic_packed *b);
9996 -void __gnet_stats_copy_basic(const seqcount_t *running,
9997 +void __gnet_stats_copy_basic(net_seqlock_t *running,
9998 struct gnet_stats_basic_packed *bstats,
9999 struct gnet_stats_basic_cpu __percpu *cpu,
10000 struct gnet_stats_basic_packed *b);
10001 @@ -55,14 +56,14 @@
10002 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10003 struct gnet_stats_rate_est64 *rate_est,
10004 spinlock_t *stats_lock,
10005 - seqcount_t *running, struct nlattr *opt);
10006 + net_seqlock_t *running, struct nlattr *opt);
10007 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10008 struct gnet_stats_rate_est64 *rate_est);
10009 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10010 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10011 struct gnet_stats_rate_est64 *rate_est,
10012 spinlock_t *stats_lock,
10013 - seqcount_t *running, struct nlattr *opt);
10014 + net_seqlock_t *running, struct nlattr *opt);
10015 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10016 const struct gnet_stats_rate_est64 *rate_est);
10018 diff -Nur linux-4.9.6.orig/include/net/neighbour.h linux-4.9.6/include/net/neighbour.h
10019 --- linux-4.9.6.orig/include/net/neighbour.h 2017-01-26 08:25:24.000000000 +0100
10020 +++ linux-4.9.6/include/net/neighbour.h 2017-01-28 13:59:09.947656802 +0100
10021 @@ -446,7 +446,7 @@
10025 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10026 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10030 @@ -501,7 +501,7 @@
10032 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
10034 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10035 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10036 const struct net_device *dev)
10039 diff -Nur linux-4.9.6.orig/include/net/netns/ipv4.h linux-4.9.6/include/net/netns/ipv4.h
10040 --- linux-4.9.6.orig/include/net/netns/ipv4.h 2017-01-26 08:25:24.000000000 +0100
10041 +++ linux-4.9.6/include/net/netns/ipv4.h 2017-01-28 13:59:09.947656802 +0100
10044 int sysctl_icmp_echo_ignore_all;
10045 int sysctl_icmp_echo_ignore_broadcasts;
10046 + int sysctl_icmp_echo_sysrq;
10047 int sysctl_icmp_ignore_bogus_error_responses;
10048 int sysctl_icmp_ratelimit;
10049 int sysctl_icmp_ratemask;
10050 diff -Nur linux-4.9.6.orig/include/net/net_seq_lock.h linux-4.9.6/include/net/net_seq_lock.h
10051 --- linux-4.9.6.orig/include/net/net_seq_lock.h 1970-01-01 01:00:00.000000000 +0100
10052 +++ linux-4.9.6/include/net/net_seq_lock.h 2017-01-28 13:59:09.947656802 +0100
10054 +#ifndef __NET_NET_SEQ_LOCK_H__
10055 +#define __NET_NET_SEQ_LOCK_H__
10057 +#ifdef CONFIG_PREEMPT_RT_BASE
10058 +# define net_seqlock_t seqlock_t
10059 +# define net_seq_begin(__r) read_seqbegin(__r)
10060 +# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
10063 +# define net_seqlock_t seqcount_t
10064 +# define net_seq_begin(__r) read_seqcount_begin(__r)
10065 +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
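Readers are written once against these macros and work in both configurations. A hedged sketch of a retry loop over the basic byte counter (field names from struct gnet_stats_basic_packed):

#include <net/gen_stats.h>
#include <net/net_seq_lock.h>

static u64 my_read_bytes(net_seqlock_t *running,
			 const struct gnet_stats_basic_packed *b)
{
	unsigned int seq;
	u64 bytes;

	do {
		seq = net_seq_begin(running);	/* seqlock read side on RT */
		bytes = b->bytes;
	} while (net_seq_retry(running, seq));

	return bytes;
}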
10069 diff -Nur linux-4.9.6.orig/include/net/sch_generic.h linux-4.9.6/include/net/sch_generic.h
10070 --- linux-4.9.6.orig/include/net/sch_generic.h 2017-01-26 08:25:24.000000000 +0100
10071 +++ linux-4.9.6/include/net/sch_generic.h 2017-01-28 13:59:09.951656955 +0100
10073 #include <linux/dynamic_queue_limits.h>
10074 #include <net/gen_stats.h>
10075 #include <net/rtnetlink.h>
10076 +#include <net/net_seq_lock.h>
10079 struct qdisc_walker;
10081 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
10082 struct qdisc_skb_head q;
10083 struct gnet_stats_basic_packed bstats;
10084 - seqcount_t running;
10085 + net_seqlock_t running;
10086 struct gnet_stats_queue qstats;
10087 unsigned long state;
10088 struct Qdisc *next_sched;
10089 @@ -98,13 +99,22 @@
10090 spinlock_t busylock ____cacheline_aligned_in_smp;
10093 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10094 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10096 +#ifdef CONFIG_PREEMPT_RT_BASE
10097 + return spin_is_locked(&qdisc->running.lock) ? true : false;
10099 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10103 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10105 +#ifdef CONFIG_PREEMPT_RT_BASE
10106 + if (try_write_seqlock(&qdisc->running))
10110 if (qdisc_is_running(qdisc))
10112 /* Variant of write_seqcount_begin() telling lockdep a trylock
10113 @@ -113,11 +123,16 @@
10114 raw_write_seqcount_begin(&qdisc->running);
10115 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10120 static inline void qdisc_run_end(struct Qdisc *qdisc)
10122 +#ifdef CONFIG_PREEMPT_RT_BASE
10123 + write_sequnlock(&qdisc->running);
10125 write_seqcount_end(&qdisc->running);
10129 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10130 @@ -308,7 +323,7 @@
10131 return qdisc_lock(root);
10134 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10135 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10137 struct Qdisc *root = qdisc_root_sleeping(qdisc);
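For orientation, a hedged sketch of the caller pattern these helpers serve, mirroring the stock qdisc_run(): exactly one CPU runs a qdisc at a time, and on RT the seqlock trylock provides that exclusion without a preemption-disabled seqcount writer:

static void my_qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q))
		__qdisc_run(q);	/* dequeue/transmit loop; calls
				 * qdisc_run_end() when it finishes */
}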
10139 diff -Nur linux-4.9.6.orig/include/trace/events/hist.h linux-4.9.6/include/trace/events/hist.h
10140 --- linux-4.9.6.orig/include/trace/events/hist.h 1970-01-01 01:00:00.000000000 +0100
10141 +++ linux-4.9.6/include/trace/events/hist.h 2017-01-28 13:59:09.951656955 +0100
10143 +#undef TRACE_SYSTEM
10144 +#define TRACE_SYSTEM hist
10146 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10147 +#define _TRACE_HIST_H
10149 +#include "latency_hist.h"
10150 +#include <linux/tracepoint.h>
10152 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10153 +#define trace_preemptirqsoff_hist(a, b)
10154 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10156 +TRACE_EVENT(preemptirqsoff_hist,
10158 + TP_PROTO(int reason, int starthist),
10160 + TP_ARGS(reason, starthist),
10162 + TP_STRUCT__entry(
10163 + __field(int, reason)
10164 + __field(int, starthist)
10168 + __entry->reason = reason;
10169 + __entry->starthist = starthist;
10172 + TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10173 + __entry->starthist ? "start" : "stop")
10177 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10178 +#define trace_hrtimer_interrupt(a, b, c, d)
10180 +TRACE_EVENT(hrtimer_interrupt,
10182 + TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10183 + struct task_struct *task),
10185 + TP_ARGS(cpu, offset, curr, task),
10187 + TP_STRUCT__entry(
10188 + __field(int, cpu)
10189 + __field(long long, offset)
10190 + __array(char, ccomm, TASK_COMM_LEN)
10191 + __field(int, cprio)
10192 + __array(char, tcomm, TASK_COMM_LEN)
10193 + __field(int, tprio)
10197 + __entry->cpu = cpu;
10198 + __entry->offset = offset;
10199 + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10200 + __entry->cprio = curr->prio;
10201 + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10202 + task != NULL ? TASK_COMM_LEN : 7);
10203 + __entry->tprio = task != NULL ? task->prio : -1;
10206 + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10207 + __entry->cpu, __entry->offset, __entry->ccomm,
10208 + __entry->cprio, __entry->tcomm, __entry->tprio)
10212 +#endif /* _TRACE_HIST_H */
10214 +/* This part must be outside protection */
10215 +#include <trace/define_trace.h>
10216 diff -Nur linux-4.9.6.orig/include/trace/events/latency_hist.h linux-4.9.6/include/trace/events/latency_hist.h
10217 --- linux-4.9.6.orig/include/trace/events/latency_hist.h 1970-01-01 01:00:00.000000000 +0100
10218 +++ linux-4.9.6/include/trace/events/latency_hist.h 2017-01-28 13:59:09.951656955 +0100
10220 +#ifndef _LATENCY_HIST_H
10221 +#define _LATENCY_HIST_H
10223 +enum hist_action {
10232 +static char *actions[] = {
10241 +static inline char *getaction(int action)
10243 + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10244 + return actions[action];
10245 + return "unknown";
10248 +#endif /* _LATENCY_HIST_H */
10249 diff -Nur linux-4.9.6.orig/init/Kconfig linux-4.9.6/init/Kconfig
10250 --- linux-4.9.6.orig/init/Kconfig 2017-01-26 08:25:24.000000000 +0100
10251 +++ linux-4.9.6/init/Kconfig 2017-01-28 13:59:09.951656955 +0100
10252 @@ -506,7 +506,7 @@
10255 bool "Make expert-level adjustments to RCU configuration"
10257 + default y if PREEMPT_RT_FULL
10259 This option needs to be enabled if you wish to make
10260 expert-level adjustments to RCU configuration. By default,
10261 @@ -623,7 +623,7 @@
10263 config RCU_FAST_NO_HZ
10264 bool "Accelerate last non-dyntick-idle CPU's grace periods"
10265 - depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10266 + depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10269 This option permits CPUs to enter dynticks-idle state even if
10270 @@ -650,7 +650,7 @@
10272 bool "Enable RCU priority boosting"
10273 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10275 + default y if PREEMPT_RT_FULL
10277 This option boosts the priority of preempted RCU readers that
10278 block the current preemptible RCU grace period for too long.
10279 @@ -781,19 +781,6 @@
10283 -config RCU_EXPEDITE_BOOT
10287 - This option enables expedited grace periods at boot time,
10288 - as if rcu_expedite_gp() had been invoked early in boot.
10289 - The corresponding rcu_unexpedite_gp() is invoked from
10290 - rcu_end_inkernel_boot(), which is intended to be invoked
10291 - at the end of the kernel-only boot sequence, just before
10294 - Accept the default if unsure.
10296 endmenu # "RCU Subsystem"
10299 @@ -1064,6 +1051,7 @@
10300 config RT_GROUP_SCHED
10301 bool "Group scheduling for SCHED_RR/FIFO"
10302 depends on CGROUP_SCHED
10303 + depends on !PREEMPT_RT_FULL
10306 This feature lets you explicitly allocate real CPU bandwidth
10307 @@ -1772,6 +1760,7 @@
10311 + depends on !PREEMPT_RT_FULL
10312 select HAVE_HARDENED_USERCOPY_ALLOCATOR
10314 The regular slab allocator that is established and known to work
10315 @@ -1792,6 +1781,7 @@
10318 bool "SLOB (Simple Allocator)"
10319 + depends on !PREEMPT_RT_FULL
10321 SLOB replaces the stock allocator with a drastically simpler
10322 allocator. SLOB is generally more space efficient but
10323 @@ -1810,7 +1800,7 @@
10325 config SLUB_CPU_PARTIAL
10327 - depends on SLUB && SMP
10328 + depends on SLUB && SMP && !PREEMPT_RT_FULL
10329 bool "SLUB per cpu partial cache"
10331 Per cpu partial caches accelerate object allocation and freeing
10332 diff -Nur linux-4.9.6.orig/init/main.c linux-4.9.6/init/main.c
10333 --- linux-4.9.6.orig/init/main.c 2017-01-26 08:25:24.000000000 +0100
10334 +++ linux-4.9.6/init/main.c 2017-01-28 13:59:09.951656955 +0100
10335 @@ -507,6 +507,7 @@
10336 setup_command_line(command_line);
10337 setup_nr_cpu_ids();
10338 setup_per_cpu_areas();
10339 + softirq_early_init();
10340 boot_cpu_state_init();
10341 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10343 diff -Nur linux-4.9.6.orig/init/Makefile linux-4.9.6/init/Makefile
10344 --- linux-4.9.6.orig/init/Makefile 2017-01-26 08:25:24.000000000 +0100
10345 +++ linux-4.9.6/init/Makefile 2017-01-28 13:59:09.951656955 +0100
10347 include/generated/compile.h: FORCE
10348 @$($(quiet)chk_compile.h)
10349 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10350 - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10351 + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10352 diff -Nur linux-4.9.6.orig/ipc/sem.c linux-4.9.6/ipc/sem.c
10353 --- linux-4.9.6.orig/ipc/sem.c 2017-01-26 08:25:24.000000000 +0100
10354 +++ linux-4.9.6/ipc/sem.c 2017-01-28 13:59:09.951656955 +0100
10355 @@ -712,6 +712,13 @@
10356 static void wake_up_sem_queue_prepare(struct list_head *pt,
10357 struct sem_queue *q, int error)
10359 +#ifdef CONFIG_PREEMPT_RT_BASE
10360 + struct task_struct *p = q->sleeper;
10361 + get_task_struct(p);
10362 + q->status = error;
10363 + wake_up_process(p);
10364 + put_task_struct(p);
10366 if (list_empty(pt)) {
10368 * Hold preempt off so that we don't get preempted and have the
10369 @@ -723,6 +730,7 @@
10372 list_add_tail(&q->list, pt);
10377 @@ -736,6 +744,7 @@
10379 static void wake_up_sem_queue_do(struct list_head *pt)
10381 +#ifndef CONFIG_PREEMPT_RT_BASE
10382 struct sem_queue *q, *t;
10385 @@ -748,6 +757,7 @@
10392 static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10393 diff -Nur linux-4.9.6.orig/kernel/cgroup.c linux-4.9.6/kernel/cgroup.c
10394 --- linux-4.9.6.orig/kernel/cgroup.c 2017-01-26 08:25:24.000000000 +0100
10395 +++ linux-4.9.6/kernel/cgroup.c 2017-01-28 13:59:09.951656955 +0100
10396 @@ -5040,10 +5040,10 @@
10397 queue_work(cgroup_destroy_wq, &css->destroy_work);
10400 -static void css_release_work_fn(struct work_struct *work)
10401 +static void css_release_work_fn(struct swork_event *sev)
10403 struct cgroup_subsys_state *css =
10404 - container_of(work, struct cgroup_subsys_state, destroy_work);
10405 + container_of(sev, struct cgroup_subsys_state, destroy_swork);
10406 struct cgroup_subsys *ss = css->ss;
10407 struct cgroup *cgrp = css->cgroup;
10409 @@ -5086,8 +5086,8 @@
10410 struct cgroup_subsys_state *css =
10411 container_of(ref, struct cgroup_subsys_state, refcnt);
10413 - INIT_WORK(&css->destroy_work, css_release_work_fn);
10414 - queue_work(cgroup_destroy_wq, &css->destroy_work);
10415 + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10416 + swork_queue(&css->destroy_swork);
10419 static void init_and_link_css(struct cgroup_subsys_state *css,
10420 @@ -5742,6 +5742,7 @@
10422 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10423 BUG_ON(!cgroup_destroy_wq);
10424 + BUG_ON(swork_get());
10427 * Used to destroy pidlists and separate to serve as flush domain.
10428 diff -Nur linux-4.9.6.orig/kernel/cpu.c linux-4.9.6/kernel/cpu.c
10429 --- linux-4.9.6.orig/kernel/cpu.c 2017-01-26 08:25:24.000000000 +0100
10430 +++ linux-4.9.6/kernel/cpu.c 2017-01-28 13:59:09.951656955 +0100
10431 @@ -239,6 +239,289 @@
10432 #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
10433 #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
10436 + * hotplug_pcp - per cpu hotplug descriptor
10437 + * @unplug: set when pin_current_cpu() needs to sync tasks
10438 + * @sync_tsk: the task that waits for tasks to finish pinned sections
10439 + * @refcount: counter of tasks in pinned sections
10440 + * @grab_lock: set when the tasks entering pinned sections should wait
10441 + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
10442 + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
10443 + * @mutex_init: zero if the mutex hasn't been initialized yet.
10445 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
10446 + * is used as a flag and still exists after @sync_tsk has exited and
10447 + * @sync_tsk set to NULL.
10449 +struct hotplug_pcp {
10450 + struct task_struct *unplug;
10451 + struct task_struct *sync_tsk;
10454 + struct completion synced;
10455 + struct completion unplug_wait;
10456 +#ifdef CONFIG_PREEMPT_RT_FULL
10458 + * Note, on PREEMPT_RT, the hotplug lock must save the state of
10459 + * the task, otherwise the mutex will cause the task to fail
10460 + * to sleep when required. (Because it's called from migrate_disable())
10462 + * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
10467 + struct mutex mutex;
10472 +#ifdef CONFIG_PREEMPT_RT_FULL
10473 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
10474 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
10476 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
10477 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
10480 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
10483 + * pin_current_cpu - Prevent the current cpu from being unplugged
10485 + * Lightweight version of get_online_cpus() to prevent cpu from being
10486 + * unplugged when code runs in a migration disabled region.
10488 + * Must be called with preemption disabled (preempt_count = 1)!
10490 +void pin_current_cpu(void)
10492 + struct hotplug_pcp *hp;
10496 + hp = this_cpu_ptr(&hotplug_pcp);
10498 + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
10499 + hp->unplug == current) {
10503 + if (hp->grab_lock) {
10504 + preempt_enable();
10505 + hotplug_lock(hp);
10506 + hotplug_unlock(hp);
10508 + preempt_enable();
10510 + * Try to push this task off of this CPU.
10512 + if (!migrate_me()) {
10513 + preempt_disable();
10514 + hp = this_cpu_ptr(&hotplug_pcp);
10515 + if (!hp->grab_lock) {
10517 + * Just let it continue; it's already pinned
10518 + * or about to sleep.
10523 + preempt_enable();
10526 + preempt_disable();
10531 + * unpin_current_cpu - Allow unplug of current cpu
10533 + * Must be called with preemption or interrupts disabled!
10535 +void unpin_current_cpu(void)
10537 + struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
10539 + WARN_ON(hp->refcount <= 0);
10541 + /* This is safe. sync_unplug_thread is pinned to this cpu */
10542 + if (!--hp->refcount && hp->unplug && hp->unplug != current)
10543 + wake_up_process(hp->unplug);
10546 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
10548 + set_current_state(TASK_UNINTERRUPTIBLE);
10549 + while (hp->refcount) {
10550 + schedule_preempt_disabled();
10551 + set_current_state(TASK_UNINTERRUPTIBLE);
10555 +static int sync_unplug_thread(void *data)
10557 + struct hotplug_pcp *hp = data;
10559 + wait_for_completion(&hp->unplug_wait);
10560 + preempt_disable();
10561 + hp->unplug = current;
10562 + wait_for_pinned_cpus(hp);
10565 + * This thread will synchronize the cpu_down() with threads
10566 + * that have pinned the CPU. When the pinned CPU count reaches
10567 + * zero, we inform the cpu_down code to continue to the next step.
10569 + set_current_state(TASK_UNINTERRUPTIBLE);
10570 + preempt_enable();
10571 + complete(&hp->synced);
10574 + * If all succeeds, the next step will need tasks to wait till
10575 + * the CPU is offline before continuing. To do this, the grab_lock
10576 + * is set and tasks going into pin_current_cpu() will block on the
10577 + * mutex. But we still need to wait for those that are already in
10578 + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
10579 + * will kick this thread out.
10581 + while (!hp->grab_lock && !kthread_should_stop()) {
10583 + set_current_state(TASK_UNINTERRUPTIBLE);
10586 + /* Make sure grab_lock is seen before we see a stale completion */
10590 + * Now just before cpu_down() enters stop machine, we need to make
10591 + * sure all tasks that are in pinned CPU sections are out, and new
10592 + * tasks will now grab the lock, keeping them from entering pinned
10595 + if (!kthread_should_stop()) {
10596 + preempt_disable();
10597 + wait_for_pinned_cpus(hp);
10598 + preempt_enable();
10599 + complete(&hp->synced);
10602 + set_current_state(TASK_UNINTERRUPTIBLE);
10603 + while (!kthread_should_stop()) {
10605 + set_current_state(TASK_UNINTERRUPTIBLE);
10607 + set_current_state(TASK_RUNNING);
10610 + * Force this thread off this CPU as it's going down and
10611 + * we don't want any more work on this CPU.
10613 + current->flags &= ~PF_NO_SETAFFINITY;
10614 + set_cpus_allowed_ptr(current, cpu_present_mask);
10619 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
10621 + wake_up_process(hp->sync_tsk);
10622 + wait_for_completion(&hp->synced);
10625 +static void __cpu_unplug_wait(unsigned int cpu)
10627 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10629 + complete(&hp->unplug_wait);
10630 + wait_for_completion(&hp->synced);
10634 + * Start the sync_unplug_thread on the target cpu and wait for it to
10637 +static int cpu_unplug_begin(unsigned int cpu)
10639 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10642 + /* Protected by cpu_hotplug.lock */
10643 + if (!hp->mutex_init) {
10644 +#ifdef CONFIG_PREEMPT_RT_FULL
10645 + spin_lock_init(&hp->lock);
10647 + mutex_init(&hp->mutex);
10649 + hp->mutex_init = 1;
10652 + /* Inform the scheduler to migrate tasks off this CPU */
10653 + tell_sched_cpu_down_begin(cpu);
10655 + init_completion(&hp->synced);
10656 + init_completion(&hp->unplug_wait);
10658 + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
10659 + if (IS_ERR(hp->sync_tsk)) {
10660 + err = PTR_ERR(hp->sync_tsk);
10661 + hp->sync_tsk = NULL;
10664 + kthread_bind(hp->sync_tsk, cpu);
10667 + * Wait for tasks to get out of the pinned sections,
10668 + * it's still OK if new tasks enter. Some CPU notifiers will
10669 + * wait for tasks that are going to enter these sections and
10670 + * we must not have them block.
10672 + wake_up_process(hp->sync_tsk);
10676 +static void cpu_unplug_sync(unsigned int cpu)
10678 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10680 + init_completion(&hp->synced);
10681 + /* The completion needs to be initialized before setting grab_lock */
10684 + /* Grab the mutex before setting grab_lock */
10685 + hotplug_lock(hp);
10686 + hp->grab_lock = 1;
10689 + * The CPU notifiers have been completed.
10690 + * Wait for tasks to get out of pinned CPU sections and have new
10691 + * tasks block until the CPU is completely down.
10693 + __cpu_unplug_sync(hp);
10695 + /* All done with the sync thread */
10696 + kthread_stop(hp->sync_tsk);
10697 + hp->sync_tsk = NULL;
10700 +static void cpu_unplug_done(unsigned int cpu)
10702 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10704 + hp->unplug = NULL;
10705 + /* Let all tasks know cpu unplug is finished before cleaning up */
10708 + if (hp->sync_tsk)
10709 + kthread_stop(hp->sync_tsk);
10711 + if (hp->grab_lock) {
10712 + hotplug_unlock(hp);
10713 + /* protected by cpu_hotplug.lock */
10714 + hp->grab_lock = 0;
10716 + tell_sched_cpu_down_done(cpu);
10719 void get_online_cpus(void)
10721 @@ -789,10 +1072,14 @@
10722 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10725 + __cpu_unplug_wait(cpu);
10726 /* Park the smpboot threads */
10727 kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
10728 smpboot_park_threads(cpu);
10730 + /* Notifiers are done. Don't let any more tasks pin this CPU. */
10731 + cpu_unplug_sync(cpu);
10734 * Prevent irq alloc/free while the dying cpu reorganizes the
10735 * interrupt affinities.
10736 @@ -877,6 +1164,9 @@
10737 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10738 int prev_state, ret = 0;
10739 bool hasdied = false;
10741 + cpumask_var_t cpumask;
10742 + cpumask_var_t cpumask_org;
10744 if (num_online_cpus() == 1)
10746 @@ -884,7 +1174,34 @@
10747 if (!cpu_present(cpu))
10750 + /* Move the downtaker off the unplug cpu */
10751 + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
10753 + if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
10754 + free_cpumask_var(cpumask);
10758 + cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
10759 + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
10760 + set_cpus_allowed_ptr(current, cpumask);
10761 + free_cpumask_var(cpumask);
10762 + migrate_disable();
10763 + mycpu = smp_processor_id();
10764 + if (mycpu == cpu) {
10765 + printk(KERN_ERR "Yuck! Still on unplug CPU\n!");
10766 + migrate_enable();
10768 + goto restore_cpus;
10771 + migrate_enable();
10772 cpu_hotplug_begin();
10773 + ret = cpu_unplug_begin(cpu);
10775 + printk("cpu_unplug_begin(%d) failed\n", cpu);
10779 cpuhp_tasks_frozen = tasks_frozen;
10781 @@ -923,10 +1240,15 @@
10783 hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
10785 + cpu_unplug_done(cpu);
10787 cpu_hotplug_done();
10788 /* This post dead nonsense must die */
10789 if (!ret && hasdied)
10790 cpu_notify_nofail(CPU_POST_DEAD, cpu);
10792 + set_cpus_allowed_ptr(current, cpumask_org);
10793 + free_cpumask_var(cpumask_org);
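For orientation, a hedged sketch of the intended caller of pin_current_cpu()/unpin_current_cpu(): on RT, migrate_disable()/migrate_enable() (changed elsewhere in this patch) bracket migration-disabled regions with these pins so a CPU cannot be unplugged from under a pinned task:

/* open-coded approximation of the migrate_disable()/migrate_enable()
 * pairing; illustrative only */
static void my_pinned_section(void)
{
	preempt_disable();	/* pin_current_cpu() expects preempt_count == 1 */
	pin_current_cpu();	/* may block while an unplug is in flight */
	preempt_enable();

	/* ... work that must not race with cpu_down() of this CPU ... */

	preempt_disable();
	unpin_current_cpu();	/* wakes the sync thread on the last unpin */
	preempt_enable();
}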
10797 diff -Nur linux-4.9.6.orig/kernel/debug/kdb/kdb_io.c linux-4.9.6/kernel/debug/kdb/kdb_io.c
10798 --- linux-4.9.6.orig/kernel/debug/kdb/kdb_io.c 2017-01-26 08:25:24.000000000 +0100
10799 +++ linux-4.9.6/kernel/debug/kdb/kdb_io.c 2017-01-28 13:59:09.951656955 +0100
10800 @@ -554,7 +554,6 @@
10803 int logging, saved_loglevel = 0;
10804 - int saved_trap_printk;
10805 int got_printf_lock = 0;
10808 @@ -565,8 +564,6 @@
10809 unsigned long uninitialized_var(flags);
10812 - saved_trap_printk = kdb_trap_printk;
10813 - kdb_trap_printk = 0;
10815 /* Serialize kdb_printf if multiple cpus try to write at once.
10816 * But if any cpu goes recursive in kdb, just print the output,
10817 @@ -855,7 +852,6 @@
10819 __release(kdb_printf_lock);
10821 - kdb_trap_printk = saved_trap_printk;
10825 @@ -865,9 +861,11 @@
10829 + kdb_trap_printk++;
10831 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
10833 + kdb_trap_printk--;
10837 diff -Nur linux-4.9.6.orig/kernel/events/core.c linux-4.9.6/kernel/events/core.c
10838 --- linux-4.9.6.orig/kernel/events/core.c 2017-01-26 08:25:24.000000000 +0100
10839 +++ linux-4.9.6/kernel/events/core.c 2017-01-28 13:59:09.955657109 +0100
10840 @@ -1050,6 +1050,7 @@
10841 raw_spin_lock_init(&cpuctx->hrtimer_lock);
10842 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
10843 timer->function = perf_mux_hrtimer_handler;
10844 + timer->irqsafe = 1;
10847 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
10848 @@ -8335,6 +8336,7 @@
10850 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10851 hwc->hrtimer.function = perf_swevent_hrtimer;
10852 + hwc->hrtimer.irqsafe = 1;
10855 * Since hrtimers have a fixed rate, we can do a static freq->period
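The irqsafe field is introduced by this patch's hrtimer changes, outside this excerpt: a timer so marked keeps expiring from hard interrupt context on RT instead of being deferred to the hrtimer softirq thread. A hedged sketch of marking one's own timer the same way:

#include <linux/hrtimer.h>

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	/* must be hard-irq safe on RT: no sleeping locks in here */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	my_timer.irqsafe = 1;	/* same treatment as the perf timers above */
}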
10856 diff -Nur linux-4.9.6.orig/kernel/exit.c linux-4.9.6/kernel/exit.c
10857 --- linux-4.9.6.orig/kernel/exit.c 2017-01-26 08:25:24.000000000 +0100
10858 +++ linux-4.9.6/kernel/exit.c 2017-01-28 13:59:09.955657109 +0100
10859 @@ -143,7 +143,7 @@
10860 * Do this under ->siglock, we can race with another thread
10861 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
10863 - flush_sigqueue(&tsk->pending);
10864 + flush_task_sigqueue(tsk);
10865 tsk->sighand = NULL;
10866 spin_unlock(&sighand->siglock);
10868 diff -Nur linux-4.9.6.orig/kernel/fork.c linux-4.9.6/kernel/fork.c
10869 --- linux-4.9.6.orig/kernel/fork.c 2017-01-26 08:25:24.000000000 +0100
10870 +++ linux-4.9.6/kernel/fork.c 2017-01-28 13:59:09.955657109 +0100
10872 #include <linux/compiler.h>
10873 #include <linux/sysctl.h>
10874 #include <linux/kcov.h>
10875 +#include <linux/kprobes.h>
10877 #include <asm/pgtable.h>
10878 #include <asm/pgalloc.h>
10879 @@ -376,13 +377,24 @@
10880 if (atomic_dec_and_test(&sig->sigcnt))
10881 free_signal_struct(sig);
10884 +#ifdef CONFIG_PREEMPT_RT_BASE
10887 void __put_task_struct(struct task_struct *tsk)
10889 WARN_ON(!tsk->exit_state);
10890 WARN_ON(atomic_read(&tsk->usage));
10891 WARN_ON(tsk == current);
10894 + * Remove function-return probe instances associated with this
10895 + * task and put them back on the free list.
10897 + kprobe_flush_task(tsk);
10899 + /* Task is done with its stack. */
10900 + put_task_stack(tsk);
10903 task_numa_free(tsk);
10904 security_task_free(tsk);
10905 @@ -393,7 +405,18 @@
10906 if (!profile_handoff_task(tsk))
10909 +#ifndef CONFIG_PREEMPT_RT_BASE
10910 EXPORT_SYMBOL_GPL(__put_task_struct);
10912 +void __put_task_struct_cb(struct rcu_head *rhp)
10914 + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
10916 + __put_task_struct(tsk);
10919 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
10922 void __init __weak arch_task_cache_init(void) { }
10924 @@ -852,6 +875,19 @@
10926 EXPORT_SYMBOL_GPL(__mmdrop);
10928 +#ifdef CONFIG_PREEMPT_RT_BASE
10930 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
10931 + * want another facility to make this work.
10933 +void __mmdrop_delayed(struct rcu_head *rhp)
10935 + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
10941 static inline void __mmput(struct mm_struct *mm)
10943 VM_BUG_ON(atomic_read(&mm->mm_users));
10944 @@ -1426,6 +1462,9 @@
10946 static void posix_cpu_timers_init(struct task_struct *tsk)
10948 +#ifdef CONFIG_PREEMPT_RT_BASE
10949 + tsk->posix_timer_list = NULL;
10951 tsk->cputime_expires.prof_exp = 0;
10952 tsk->cputime_expires.virt_exp = 0;
10953 tsk->cputime_expires.sched_exp = 0;
10954 @@ -1552,6 +1591,7 @@
10955 spin_lock_init(&p->alloc_lock);
10957 init_sigpending(&p->pending);
10958 + p->sigqueue_cache = NULL;
10960 p->utime = p->stime = p->gtime = 0;
10961 p->utimescaled = p->stimescaled = 0;
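__put_task_struct_cb() pairs with an RT variant of put_task_struct() in include/linux/sched.h, which this excerpt does not show. Its presumed shape, so the RCU indirection above makes sense (put_rcu is the rcu_head this patch is assumed to add to task_struct):

#ifdef CONFIG_PREEMPT_RT_BASE
static inline void put_task_struct(struct task_struct *t)
{
	/* defer the free through RCU: the final put may happen in a
	 * context that cannot take the sleeping locks freeing needs */
	if (atomic_dec_and_test(&t->usage))
		call_rcu(&t->put_rcu, __put_task_struct_cb);
}
#endif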
10962 diff -Nur linux-4.9.6.orig/kernel/futex.c linux-4.9.6/kernel/futex.c
10963 --- linux-4.9.6.orig/kernel/futex.c 2017-01-26 08:25:24.000000000 +0100
10964 +++ linux-4.9.6/kernel/futex.c 2017-01-28 13:59:09.955657109 +0100
10965 @@ -904,7 +904,9 @@
10966 * task still owns the PI-state:
10968 if (head->next != next) {
10969 + raw_spin_unlock_irq(&curr->pi_lock);
10970 spin_unlock(&hb->lock);
10971 + raw_spin_lock_irq(&curr->pi_lock);
10975 @@ -1299,6 +1301,7 @@
10976 struct futex_pi_state *pi_state = this->pi_state;
10977 u32 uninitialized_var(curval), newval;
10979 + WAKE_Q(wake_sleeper_q);
10983 @@ -1365,7 +1368,8 @@
10985 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
10987 - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
10988 + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
10989 + &wake_sleeper_q);
10992 * First unlock HB so the waiter does not spin on it once he got woken
10993 @@ -1373,8 +1377,9 @@
10994 * deboost first (and lose our higher priority), then the task might get
10995 * scheduled away before the wake up can take place.
10997 - spin_unlock(&hb->lock);
10998 + deboost |= spin_unlock_no_deboost(&hb->lock);
10999 wake_up_q(&wake_q);
11000 + wake_up_q_sleeper(&wake_sleeper_q);
11002 rt_mutex_adjust_prio(current);
11004 @@ -1924,6 +1929,16 @@
11005 requeue_pi_wake_futex(this, &key2, hb2);
11008 + } else if (ret == -EAGAIN) {
11010 + * Waiter was woken by timeout or
11011 + * signal and has set pi_blocked_on to
11012 + * PI_WAKEUP_INPROGRESS before we
11013 + * tried to enqueue it on the rtmutex.
11015 + this->pi_state = NULL;
11016 + put_pi_state(pi_state);
11020 * rt_mutex_start_proxy_lock() detected a
11021 @@ -2814,7 +2829,7 @@
11022 struct hrtimer_sleeper timeout, *to = NULL;
11023 struct rt_mutex_waiter rt_waiter;
11024 struct rt_mutex *pi_mutex = NULL;
11025 - struct futex_hash_bucket *hb;
11026 + struct futex_hash_bucket *hb, *hb2;
11027 union futex_key key2 = FUTEX_KEY_INIT;
11028 struct futex_q q = futex_q_init;
11030 @@ -2839,10 +2854,7 @@
11031 * The waiter is allocated on our stack, manipulated by the requeue
11032 * code while we sleep on uaddr.
11034 - debug_rt_mutex_init_waiter(&rt_waiter);
11035 - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
11036 - RB_CLEAR_NODE(&rt_waiter.tree_entry);
11037 - rt_waiter.task = NULL;
11038 + rt_mutex_init_waiter(&rt_waiter, false);
11040 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
11041 if (unlikely(ret != 0))
11042 @@ -2873,20 +2885,55 @@
11043 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
11044 futex_wait_queue_me(hb, &q, to);
11046 - spin_lock(&hb->lock);
11047 - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11048 - spin_unlock(&hb->lock);
11050 - goto out_put_keys;
11052 + * On RT we must avoid races with requeue and trying to block
11053 + * on two mutexes (hb->lock and uaddr2's rtmutex) by
11054 + * serializing access to pi_blocked_on with pi_lock.
11056 + raw_spin_lock_irq(&current->pi_lock);
11057 + if (current->pi_blocked_on) {
11059 + * We have been requeued or are in the process of
11060 + * being requeued.
11062 + raw_spin_unlock_irq(&current->pi_lock);
11065 + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
11066 + * prevents a concurrent requeue from moving us to the
11067 + * uaddr2 rtmutex. After that we can safely acquire
11068 + * (and possibly block on) hb->lock.
11070 + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
11071 + raw_spin_unlock_irq(&current->pi_lock);
11073 + spin_lock(&hb->lock);
11076 + * Clean up pi_blocked_on. We might leak it otherwise
11077 + * when we succeeded with the hb->lock in the fast
11080 + raw_spin_lock_irq(&current->pi_lock);
11081 + current->pi_blocked_on = NULL;
11082 + raw_spin_unlock_irq(&current->pi_lock);
11084 + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11085 + spin_unlock(&hb->lock);
11087 + goto out_put_keys;
11091 - * In order for us to be here, we know our q.key == key2, and since
11092 - * we took the hb->lock above, we also know that futex_requeue() has
11093 - * completed and we no longer have to concern ourselves with a wakeup
11094 - * race with the atomic proxy lock acquisition by the requeue code. The
11095 - * futex_requeue dropped our key1 reference and incremented our key2
11096 - * reference count.
11097 + * In order to be here, we have either been requeued, are in
11098 + * the process of being requeued, or requeue successfully
11099 + * acquired uaddr2 on our behalf. If pi_blocked_on was
11100 + * non-null above, we may be racing with a requeue. Do not
11101 + * rely on q->lock_ptr to be hb2->lock until after blocking on
11102 + * hb->lock or hb2->lock. The futex_requeue dropped our key1
11103 + * reference and incremented our key2 reference count.
11105 + hb2 = hash_futex(&key2);
11107 /* Check if the requeue code acquired the second futex for us. */
11108 if (!q.rt_waiter) {
11109 @@ -2895,14 +2942,15 @@
11110 * did a lock-steal - fix up the PI-state in that case.
11112 if (q.pi_state && (q.pi_state->owner != current)) {
11113 - spin_lock(q.lock_ptr);
11114 + spin_lock(&hb2->lock);
11115 + BUG_ON(&hb2->lock != q.lock_ptr);
11116 ret = fixup_pi_state_owner(uaddr2, &q, current);
11118 * Drop the reference to the pi state which
11119 * the requeue_pi() code acquired for us.
11121 put_pi_state(q.pi_state);
11122 - spin_unlock(q.lock_ptr);
11123 + spin_unlock(&hb2->lock);
11127 @@ -2915,7 +2963,8 @@
11128 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
11129 debug_rt_mutex_free_waiter(&rt_waiter);
11131 - spin_lock(q.lock_ptr);
11132 + spin_lock(&hb2->lock);
11133 + BUG_ON(&hb2->lock != q.lock_ptr);
11135 * Fixup the pi_state owner and possibly acquire the lock if we
11137 diff -Nur linux-4.9.6.orig/kernel/irq/handle.c linux-4.9.6/kernel/irq/handle.c
11138 --- linux-4.9.6.orig/kernel/irq/handle.c 2017-01-26 08:25:24.000000000 +0100
11139 +++ linux-4.9.6/kernel/irq/handle.c 2017-01-28 13:59:09.955657109 +0100
11140 @@ -181,10 +181,16 @@
11142 irqreturn_t retval;
11143 unsigned int flags = 0;
11144 + struct pt_regs *regs = get_irq_regs();
11145 + u64 ip = regs ? instruction_pointer(regs) : 0;
11147 retval = __handle_irq_event_percpu(desc, &flags);
11149 - add_interrupt_randomness(desc->irq_data.irq, flags);
11150 +#ifdef CONFIG_PREEMPT_RT_FULL
11151 + desc->random_ip = ip;
11153 + add_interrupt_randomness(desc->irq_data.irq, flags, ip);
11157 note_interrupt(desc, retval);
11158 diff -Nur linux-4.9.6.orig/kernel/irq/manage.c linux-4.9.6/kernel/irq/manage.c
11159 --- linux-4.9.6.orig/kernel/irq/manage.c 2017-01-26 08:25:24.000000000 +0100
11160 +++ linux-4.9.6/kernel/irq/manage.c 2017-01-28 13:59:09.955657109 +0100
11162 #include "internals.h"
11164 #ifdef CONFIG_IRQ_FORCED_THREADING
11165 +# ifndef CONFIG_PREEMPT_RT_BASE
11166 __read_mostly bool force_irqthreads;
11168 static int __init setup_forced_irqthreads(char *arg)
11172 early_param("threadirqs", setup_forced_irqthreads);
11176 static void __synchronize_hardirq(struct irq_desc *desc)
11177 @@ -233,7 +235,12 @@
11179 if (desc->affinity_notify) {
11180 kref_get(&desc->affinity_notify->kref);
11182 +#ifdef CONFIG_PREEMPT_RT_BASE
11183 + swork_queue(&desc->affinity_notify->swork);
11185 schedule_work(&desc->affinity_notify->work);
11188 irqd_set(data, IRQD_AFFINITY_SET);
11190 @@ -271,10 +278,8 @@
11192 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
11194 -static void irq_affinity_notify(struct work_struct *work)
11195 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
11197 - struct irq_affinity_notify *notify =
11198 - container_of(work, struct irq_affinity_notify, work);
11199 struct irq_desc *desc = irq_to_desc(notify->irq);
11200 cpumask_var_t cpumask;
11201 unsigned long flags;
11202 @@ -296,6 +301,35 @@
11203 kref_put(&notify->kref, notify->release);
11206 +#ifdef CONFIG_PREEMPT_RT_BASE
11207 +static void init_helper_thread(void)
11209 + static int init_sworker_once;
11211 + if (init_sworker_once)
11213 + if (WARN_ON(swork_get()))
11215 + init_sworker_once = 1;
11218 +static void irq_affinity_notify(struct swork_event *swork)
11220 + struct irq_affinity_notify *notify =
11221 + container_of(swork, struct irq_affinity_notify, swork);
11222 + _irq_affinity_notify(notify);
11227 +static void irq_affinity_notify(struct work_struct *work)
11229 + struct irq_affinity_notify *notify =
11230 + container_of(work, struct irq_affinity_notify, work);
11231 + _irq_affinity_notify(notify);
11236 * irq_set_affinity_notifier - control notification of IRQ affinity changes
11237 * @irq: Interrupt for which to enable/disable notification
11238 @@ -324,7 +358,12 @@
11241 kref_init(&notify->kref);
11242 +#ifdef CONFIG_PREEMPT_RT_BASE
11243 + INIT_SWORK(&notify->swork, irq_affinity_notify);
11244 + init_helper_thread();
11246 INIT_WORK(&notify->work, irq_affinity_notify);
11250 raw_spin_lock_irqsave(&desc->lock, flags);
11251 @@ -879,7 +918,15 @@
11252 local_bh_disable();
11253 ret = action->thread_fn(action->irq, action->dev_id);
11254 irq_finalize_oneshot(desc, action);
11255 - local_bh_enable();
11257 + * Interrupts which have real time requirements can be set up
11258 + * to avoid softirq processing in the thread handler. This is
11259 + * safe as these interrupts do not raise soft interrupts.
11261 + if (irq_settings_no_softirq_call(desc))
11262 + _local_bh_enable();
11264 + local_bh_enable();
11268 @@ -976,6 +1023,12 @@
11269 if (action_ret == IRQ_WAKE_THREAD)
11270 irq_wake_secondary(desc, action);
11272 +#ifdef CONFIG_PREEMPT_RT_FULL
11273 + migrate_disable();
11274 + add_interrupt_randomness(action->irq, 0,
11275 + desc->random_ip ^ (unsigned long) action);
11276 + migrate_enable();
11278 wake_threads_waitq(desc);
11281 @@ -1336,6 +1389,9 @@
11282 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
11285 + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
11286 + irq_settings_set_no_softirq_call(desc);
11288 /* Set default affinity mask once everything is setup */
11289 setup_affinity(desc, mask);
11291 @@ -2061,7 +2117,7 @@
11292 * This call sets the internal irqchip state of an interrupt,
11293 * depending on the value of @which.
11295 - * This function should be called with preemption disabled if the
11296 + * This function should be called with migration disabled if the
11297 * interrupt controller has per-cpu registers.
11299 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
11300 diff -Nur linux-4.9.6.orig/kernel/irq/settings.h linux-4.9.6/kernel/irq/settings.h
11301 --- linux-4.9.6.orig/kernel/irq/settings.h 2017-01-26 08:25:24.000000000 +0100
11302 +++ linux-4.9.6/kernel/irq/settings.h 2017-01-28 13:59:09.955657109 +0100
11304 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
11305 _IRQ_IS_POLLED = IRQ_IS_POLLED,
11306 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
11307 + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
11308 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
11312 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
11313 #define IRQ_IS_POLLED GOT_YOU_MORON
11314 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
11315 +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
11316 #undef IRQF_MODIFY_MASK
11317 #define IRQF_MODIFY_MASK GOT_YOU_MORON
11320 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
11323 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
11325 + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
11328 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
11330 + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
11333 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
11335 return desc->status_use_accessors & _IRQ_PER_CPU;
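The accessors above are driven by IRQF_NO_SOFTIRQ_CALL, which this patch adds to <linux/interrupt.h> (not in this excerpt). A hedged sketch of a driver opting its threaded handler out of softirq processing; my_thread_fn is illustrative and must never raise soft interrupts:

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* latency-critical work; must not raise softirqs */
	return IRQ_HANDLED;
}

static int my_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, NULL, my_thread_fn,
				    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
				    "my-rt-dev", dev);
}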
11336 diff -Nur linux-4.9.6.orig/kernel/irq/spurious.c linux-4.9.6/kernel/irq/spurious.c
11337 --- linux-4.9.6.orig/kernel/irq/spurious.c 2017-01-26 08:25:24.000000000 +0100
11338 +++ linux-4.9.6/kernel/irq/spurious.c 2017-01-28 13:59:09.955657109 +0100
11339 @@ -442,6 +442,10 @@
11341 static int __init irqfixup_setup(char *str)
11343 +#ifdef CONFIG_PREEMPT_RT_BASE
11344 + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11348 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
11349 printk(KERN_WARNING "This may impact system performance.\n");
11350 @@ -454,6 +458,10 @@
11352 static int __init irqpoll_setup(char *str)
11354 +#ifdef CONFIG_PREEMPT_RT_BASE
11355 + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11359 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
11361 diff -Nur linux-4.9.6.orig/kernel/irq_work.c linux-4.9.6/kernel/irq_work.c
11362 --- linux-4.9.6.orig/kernel/irq_work.c 2017-01-26 08:25:24.000000000 +0100
11363 +++ linux-4.9.6/kernel/irq_work.c 2017-01-28 13:59:09.955657109 +0100
11365 #include <linux/cpu.h>
11366 #include <linux/notifier.h>
11367 #include <linux/smp.h>
11368 +#include <linux/interrupt.h>
11369 #include <asm/processor.h>
11374 bool irq_work_queue_on(struct irq_work *work, int cpu)
11376 + struct llist_head *list;
11378 /* All work should have been flushed before going offline */
11379 WARN_ON_ONCE(cpu_is_offline(cpu));
11382 if (!irq_work_claim(work))
11385 - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
11386 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
11387 + list = &per_cpu(lazy_list, cpu);
11389 + list = &per_cpu(raised_list, cpu);
11391 + if (llist_add(&work->llnode, list))
11392 arch_send_call_function_single_ipi(cpu);
11396 /* Enqueue the irq work @work on the current CPU */
11397 bool irq_work_queue(struct irq_work *work)
11399 + struct llist_head *list;
11400 + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
11402 /* Only queue if not already pending */
11403 if (!irq_work_claim(work))
11405 @@ -93,13 +104,15 @@
11406 /* Queue the entry and raise the IPI if needed. */
11409 - /* If the work is "lazy", handle it from next tick if any */
11410 - if (work->flags & IRQ_WORK_LAZY) {
11411 - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
11412 - tick_nohz_tick_stopped())
11413 - arch_irq_work_raise();
11415 - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
11416 + lazy_work = work->flags & IRQ_WORK_LAZY;
11418 + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
11419 + list = this_cpu_ptr(&lazy_list);
11421 + list = this_cpu_ptr(&raised_list);
11423 + if (llist_add(&work->llnode, list)) {
11424 + if (!lazy_work || tick_nohz_tick_stopped())
11425 arch_irq_work_raise();
11428 @@ -116,9 +129,8 @@
11429 raised = this_cpu_ptr(&raised_list);
11430 lazy = this_cpu_ptr(&lazy_list);
11432 - if (llist_empty(raised) || arch_irq_work_has_interrupt())
11433 - if (llist_empty(lazy))
11435 + if (llist_empty(raised) && llist_empty(lazy))
11436 + return false;
11438 /* All work should have been flushed before going offline */
11439 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
11440 @@ -132,7 +144,7 @@
11441 struct irq_work *work;
11442 struct llist_node *llnode;
11444 - BUG_ON(!irqs_disabled());
11445 + BUG_ON_NONRT(!irqs_disabled());
11447 if (llist_empty(list))
11449 @@ -169,7 +181,16 @@
11450 void irq_work_run(void)
11452 irq_work_run_list(this_cpu_ptr(&raised_list));
11453 - irq_work_run_list(this_cpu_ptr(&lazy_list));
11454 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
11456 + * NOTE: we raise softirq via IPI for safety,
11457 + * and execute in irq_work_tick() to move the
11458 + * overhead from hard to soft irq context.
11460 + if (!llist_empty(this_cpu_ptr(&lazy_list)))
11461 + raise_softirq(TIMER_SOFTIRQ);
11463 + irq_work_run_list(this_cpu_ptr(&lazy_list));
11465 EXPORT_SYMBOL_GPL(irq_work_run);
11467 @@ -179,8 +200,17 @@
11469 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
11470 irq_work_run_list(raised);
11472 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
11473 + irq_work_run_list(this_cpu_ptr(&lazy_list));
11476 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
11477 +void irq_work_tick_soft(void)
11479 irq_work_run_list(this_cpu_ptr(&lazy_list));
11484 * Synchronize against the irq_work @entry, ensures the entry is not
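On PREEMPT_RT_FULL, queued irq_work items are diverted to the per-CPU lazy_list and executed from the timer softirq; only items flagged IRQ_WORK_HARD_IRQ stay on raised_list and run from the hard interrupt. A minimal sketch (callback and setup names are hypothetical; IRQ_WORK_HARD_IRQ is the flag added by this series) of queueing work that must keep hard-irq semantics on RT:

#include <linux/irq_work.h>

static void my_cb(struct irq_work *work)
{
        /* runs in hard-irq context even with PREEMPT_RT_FULL */
}

static struct irq_work my_work;

static void my_setup(void)
{
        init_irq_work(&my_work, my_cb);
        my_work.flags = IRQ_WORK_HARD_IRQ;      /* stay on raised_list on RT */
}

static void my_kick(void)
{
        irq_work_queue(&my_work);
}

Work queued without the flag still runs, just later and in softirq context, which is what moves the overhead out of the latency-critical hard-irq path.
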
11485 diff -Nur linux-4.9.6.orig/kernel/Kconfig.locks linux-4.9.6/kernel/Kconfig.locks
11486 --- linux-4.9.6.orig/kernel/Kconfig.locks 2017-01-26 08:25:24.000000000 +0100
11487 +++ linux-4.9.6/kernel/Kconfig.locks 2017-01-28 13:59:09.951656955 +0100
11488 @@ -225,11 +225,11 @@
11490 config MUTEX_SPIN_ON_OWNER
11492 - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
11493 + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11495 config RWSEM_SPIN_ON_OWNER
11497 - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
11498 + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11500 config LOCK_SPIN_ON_OWNER
11502 diff -Nur linux-4.9.6.orig/kernel/Kconfig.preempt linux-4.9.6/kernel/Kconfig.preempt
11503 --- linux-4.9.6.orig/kernel/Kconfig.preempt 2017-01-26 08:25:24.000000000 +0100
11504 +++ linux-4.9.6/kernel/Kconfig.preempt 2017-01-28 13:59:09.951656955 +0100
11508 + select PREEMPT_COUNT
11510 +config PREEMPT_RT_BASE
11514 +config HAVE_PREEMPT_LAZY
11517 +config PREEMPT_LAZY
11518 + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
11521 prompt "Preemption Model"
11524 Select this if you are building a kernel for a desktop system.
11527 +config PREEMPT__LL
11528 bool "Preemptible Kernel (Low-Latency Desktop)"
11529 - select PREEMPT_COUNT
11531 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
11533 This option reduces the latency of the kernel by making
11535 embedded system with latency requirements in the milliseconds
11538 +config PREEMPT_RTB
11539 + bool "Preemptible Kernel (Basic RT)"
11540 + select PREEMPT_RT_BASE
11542 + This option is basically the same as (Low-Latency Desktop) but
11543 + enables changes which are preliminary for the full preemptible
11544 + kernel.
11546 +config PREEMPT_RT_FULL
11547 + bool "Fully Preemptible Kernel (RT)"
11548 + depends on IRQ_FORCED_THREADING
11549 + select PREEMPT_RT_BASE
11550 + select PREEMPT_RCU
11552 + All and everything: the fully preemptible kernel model. Spinlocks
11553 + and rwlocks become sleeping locks and almost all kernel code runs
11554 + preemptibly.
11556 config PREEMPT_COUNT
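PREEMPT_RT_BASE is selected by both PREEMPT_RTB and PREEMPT_RT_FULL, so code shared by the two models can key off it at compile time; a trivial, purely illustrative check:

#include <linux/kconfig.h>

static inline bool rt_base_enabled(void)
{
        /* true for both PREEMPT_RTB and PREEMPT_RT_FULL kernels */
        return IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
}
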
11557 diff -Nur linux-4.9.6.orig/kernel/ksysfs.c linux-4.9.6/kernel/ksysfs.c
11558 --- linux-4.9.6.orig/kernel/ksysfs.c 2017-01-26 08:25:24.000000000 +0100
11559 +++ linux-4.9.6/kernel/ksysfs.c 2017-01-28 13:59:09.955657109 +0100
11560 @@ -136,6 +136,15 @@
11562 #endif /* CONFIG_KEXEC_CORE */
11564 +#if defined(CONFIG_PREEMPT_RT_FULL)
11565 +static ssize_t realtime_show(struct kobject *kobj,
11566 + struct kobj_attribute *attr, char *buf)
11568 + return sprintf(buf, "%d\n", 1);
11570 +KERNEL_ATTR_RO(realtime);
11573 /* whether file capabilities are enabled */
11574 static ssize_t fscaps_show(struct kobject *kobj,
11575 struct kobj_attribute *attr, char *buf)
11576 @@ -225,6 +234,9 @@
11577 &rcu_expedited_attr.attr,
11578 &rcu_normal_attr.attr,
11580 +#ifdef CONFIG_PREEMPT_RT_FULL
11581 + &realtime_attr.attr,
11586 diff -Nur linux-4.9.6.orig/kernel/locking/lockdep.c linux-4.9.6/kernel/locking/lockdep.c
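The new attribute gives userspace a cheap way to detect an RT kernel: /sys/kernel/realtime exists only on PREEMPT_RT_FULL and always reads "1". An illustrative userspace probe:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        /* the file is absent on non-RT kernels, so fopen() failing means "no" */
        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT: %s\n", rt ? "yes" : "no");
        return 0;
}
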
11587 --- linux-4.9.6.orig/kernel/locking/lockdep.c 2017-01-26 08:25:24.000000000 +0100
11588 +++ linux-4.9.6/kernel/locking/lockdep.c 2017-01-28 13:59:09.955657109 +0100
11589 @@ -3689,6 +3689,7 @@
11593 +#ifndef CONFIG_PREEMPT_RT_FULL
11595 * We dont accurately track softirq state in e.g.
11596 * hardirq contexts (such as on 4KSTACKS), so only
11597 @@ -3703,6 +3704,7 @@
11598 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
11604 print_irqtrace_events(current);
11605 diff -Nur linux-4.9.6.orig/kernel/locking/locktorture.c linux-4.9.6/kernel/locking/locktorture.c
11606 --- linux-4.9.6.orig/kernel/locking/locktorture.c 2017-01-26 08:25:24.000000000 +0100
11607 +++ linux-4.9.6/kernel/locking/locktorture.c 2017-01-28 13:59:09.959657263 +0100
11609 #include <linux/kthread.h>
11610 #include <linux/sched/rt.h>
11611 #include <linux/spinlock.h>
11612 -#include <linux/rwlock.h>
11613 #include <linux/mutex.h>
11614 #include <linux/rwsem.h>
11615 #include <linux/smp.h>
11616 diff -Nur linux-4.9.6.orig/kernel/locking/Makefile linux-4.9.6/kernel/locking/Makefile
11617 --- linux-4.9.6.orig/kernel/locking/Makefile 2017-01-26 08:25:24.000000000 +0100
11618 +++ linux-4.9.6/kernel/locking/Makefile 2017-01-28 13:59:09.955657109 +0100
11620 # and is generally not a function of system call inputs.
11621 KCOV_INSTRUMENT := n
11623 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
11624 +obj-y += semaphore.o percpu-rwsem.o
11626 ifdef CONFIG_FUNCTION_TRACER
11627 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
11629 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
11632 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11634 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
11637 obj-$(CONFIG_LOCKDEP) += lockdep.o
11638 ifeq ($(CONFIG_PROC_FS),y)
11639 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
11641 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
11642 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
11643 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
11644 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11645 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
11646 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
11648 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
11649 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
11650 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
11651 diff -Nur linux-4.9.6.orig/kernel/locking/percpu-rwsem.c linux-4.9.6/kernel/locking/percpu-rwsem.c
11652 --- linux-4.9.6.orig/kernel/locking/percpu-rwsem.c 2017-01-26 08:25:24.000000000 +0100
11653 +++ linux-4.9.6/kernel/locking/percpu-rwsem.c 2017-01-28 13:59:09.959657263 +0100
11655 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
11656 rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
11657 __init_rwsem(&sem->rw_sem, name, rwsem_key);
11658 - init_waitqueue_head(&sem->writer);
11659 + init_swait_queue_head(&sem->writer);
11660 sem->readers_block = 0;
11663 @@ -103,7 +103,7 @@
11664 __this_cpu_dec(*sem->read_count);
11666 /* Prod writer to recheck readers_active */
11667 - wake_up(&sem->writer);
11668 + swake_up(&sem->writer);
11670 EXPORT_SYMBOL_GPL(__percpu_up_read);
11672 @@ -160,7 +160,7 @@
11675 /* Wait for all now active readers to complete. */
11676 - wait_event(sem->writer, readers_active_check(sem));
11677 + swait_event(sem->writer, readers_active_check(sem));
11679 EXPORT_SYMBOL_GPL(percpu_down_write);
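The writer-side wait in percpu-rwsem moves from the classic waitqueue to the simple-wait (swait) API, whose wake side does bounded work under a raw lock and is therefore usable from the contexts RT cares about. The conversion pattern, sketched on a hypothetical structure:

#include <linux/swait.h>

struct foo {
        struct swait_queue_head wq;
        bool done;
};

static void foo_init(struct foo *f)
{
        init_swait_queue_head(&f->wq);  /* was: init_waitqueue_head() */
        f->done = false;
}

static void foo_wait(struct foo *f)
{
        swait_event(f->wq, f->done);    /* was: wait_event() */
}

static void foo_complete(struct foo *f)
{
        f->done = true;
        swake_up(&f->wq);               /* was: wake_up() */
}

Note that swait is deliberately minimal: no exclusive/non-exclusive mix and no custom wake callbacks, so it is not a drop-in replacement everywhere the classic waitqueue is used.
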
11681 diff -Nur linux-4.9.6.orig/kernel/locking/rt.c linux-4.9.6/kernel/locking/rt.c
11682 --- linux-4.9.6.orig/kernel/locking/rt.c 1970-01-01 01:00:00.000000000 +0100
11683 +++ linux-4.9.6/kernel/locking/rt.c 2017-01-28 13:59:10.035660185 +0100
11688 + * Real-Time Preemption Support
11690 + * started by Ingo Molnar:
11692 + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
11693 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
11695 + * historic credit for proving that Linux spinlocks can be implemented via
11696 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
11697 + * and others) who prototyped it on 2.4 and did lots of comparative
11698 + * research and analysis; TimeSys, for proving that you can implement a
11699 + * fully preemptible kernel via the use of IRQ threading and mutexes;
11700 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
11701 + * right one; and to MontaVista, who ported pmutexes to 2.6.
11703 + * This code is a from-scratch implementation and is not based on pmutexes,
11704 + * but the idea of converting spinlocks to mutexes is used here too.
11706 + * lock debugging, locking tree, deadlock detection:
11708 + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
11709 + * Released under the General Public License (GPL).
11711 + * Includes portions of the generic R/W semaphore implementation from:
11713 + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
11714 + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
11715 + * - Derived also from comments by Linus
11717 + * Pending ownership of locks and ownership stealing:
11719 + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
11721 + * (also by Steven Rostedt)
11722 + * - Converted single pi_lock to individual task locks.
11724 + * By Esben Nielsen:
11725 + * Doing priority inheritance with help of the scheduler.
11727 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
11728 + * - major rework based on Esben Nielsens initial patch
11729 + * - replaced thread_info references by task_struct refs
11730 + * - removed task->pending_owner dependency
11731 + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
11732 + * in the scheduler return path as discussed with Steven Rostedt
11734 + * Copyright (C) 2006, Kihon Technologies Inc.
11735 + * Steven Rostedt <rostedt@goodmis.org>
11736 + * - debugged and patched Thomas Gleixner's rework.
11737 + * - added back the cmpxchg to the rework.
11738 + * - turned atomic require back on for SMP.
11741 +#include <linux/spinlock.h>
11742 +#include <linux/rtmutex.h>
11743 +#include <linux/sched.h>
11744 +#include <linux/delay.h>
11745 +#include <linux/module.h>
11746 +#include <linux/kallsyms.h>
11747 +#include <linux/syscalls.h>
11748 +#include <linux/interrupt.h>
11749 +#include <linux/plist.h>
11750 +#include <linux/fs.h>
11751 +#include <linux/futex.h>
11752 +#include <linux/hrtimer.h>
11754 +#include "rtmutex_common.h"
11757 + * struct mutex functions
11759 +void __mutex_do_init(struct mutex *mutex, const char *name,
11760 + struct lock_class_key *key)
11762 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11764 + * Make sure we are not reinitializing a held lock:
11766 + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
11767 + lockdep_init_map(&mutex->dep_map, name, key, 0);
11769 + mutex->lock.save_state = 0;
11771 +EXPORT_SYMBOL(__mutex_do_init);
11773 +void __lockfunc _mutex_lock(struct mutex *lock)
11775 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11776 + rt_mutex_lock(&lock->lock);
11778 +EXPORT_SYMBOL(_mutex_lock);
11780 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
11784 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11785 + ret = rt_mutex_lock_interruptible(&lock->lock);
11787 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11790 +EXPORT_SYMBOL(_mutex_lock_interruptible);
11792 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
11796 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11797 + ret = rt_mutex_lock_killable(&lock->lock);
11799 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11802 +EXPORT_SYMBOL(_mutex_lock_killable);
11804 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11805 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
11807 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
11808 + rt_mutex_lock(&lock->lock);
11810 +EXPORT_SYMBOL(_mutex_lock_nested);
11812 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
11814 + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
11815 + rt_mutex_lock(&lock->lock);
11817 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
11819 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
11823 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
11824 + ret = rt_mutex_lock_interruptible(&lock->lock);
11826 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11829 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
11831 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
11835 + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
11836 + ret = rt_mutex_lock_killable(&lock->lock);
11838 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11841 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
11844 +int __lockfunc _mutex_trylock(struct mutex *lock)
11846 + int ret = rt_mutex_trylock(&lock->lock);
11849 + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
11853 +EXPORT_SYMBOL(_mutex_trylock);
11855 +void __lockfunc _mutex_unlock(struct mutex *lock)
11857 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11858 + rt_mutex_unlock(&lock->lock);
11860 +EXPORT_SYMBOL(_mutex_unlock);
11863 + * rwlock_t functions
11865 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
11869 + migrate_disable();
11870 + ret = rt_mutex_trylock(&rwlock->lock);
11872 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
11874 + migrate_enable();
11878 +EXPORT_SYMBOL(rt_write_trylock);
11880 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
11885 + ret = rt_write_trylock(rwlock);
11888 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
11890 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
11892 + struct rt_mutex *lock = &rwlock->lock;
11896 + * recursive read locks succeed when current owns the lock,
11897 + * but not when read_depth == 0, which means that the lock is
11898 + * write locked.
11899 + */
11900 + if (rt_mutex_owner(lock) != current) {
11901 + migrate_disable();
11902 + ret = rt_mutex_trylock(lock);
11904 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
11906 + migrate_enable();
11908 + } else if (!rwlock->read_depth) {
11913 + rwlock->read_depth++;
11917 +EXPORT_SYMBOL(rt_read_trylock);
11919 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
11921 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
11922 + __rt_spin_lock(&rwlock->lock);
11924 +EXPORT_SYMBOL(rt_write_lock);
11926 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
11928 + struct rt_mutex *lock = &rwlock->lock;
11932 + * recursive read locks succeed when current owns the lock
11934 + if (rt_mutex_owner(lock) != current) {
11935 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
11936 + __rt_spin_lock(lock);
11938 + rwlock->read_depth++;
11941 +EXPORT_SYMBOL(rt_read_lock);
11943 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
11945 + /* NOTE: we always pass in '1' for nested, for simplicity */
11946 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
11947 + __rt_spin_unlock(&rwlock->lock);
11948 + migrate_enable();
11950 +EXPORT_SYMBOL(rt_write_unlock);
11952 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
11954 + /* Release the lock only when read_depth is down to 0 */
11955 + if (--rwlock->read_depth == 0) {
11956 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
11957 + __rt_spin_unlock(&rwlock->lock);
11958 + migrate_enable();
11961 +EXPORT_SYMBOL(rt_read_unlock);
11963 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
11965 + rt_write_lock(rwlock);
11969 +EXPORT_SYMBOL(rt_write_lock_irqsave);
11971 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
11973 + rt_read_lock(rwlock);
11977 +EXPORT_SYMBOL(rt_read_lock_irqsave);
11979 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
11981 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11983 + * Make sure we are not reinitializing a held lock:
11985 + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
11986 + lockdep_init_map(&rwlock->dep_map, name, key, 0);
11988 + rwlock->lock.save_state = 1;
11989 + rwlock->read_depth = 0;
11991 +EXPORT_SYMBOL(__rt_rwlock_init);
11997 +void rt_up_write(struct rw_semaphore *rwsem)
11999 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12000 + rt_mutex_unlock(&rwsem->lock);
12002 +EXPORT_SYMBOL(rt_up_write);
12004 +void __rt_up_read(struct rw_semaphore *rwsem)
12006 + if (--rwsem->read_depth == 0)
12007 + rt_mutex_unlock(&rwsem->lock);
12010 +void rt_up_read(struct rw_semaphore *rwsem)
12012 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12013 + __rt_up_read(rwsem);
12015 +EXPORT_SYMBOL(rt_up_read);
12018 + * downgrade a write lock into a read lock
12019 + * - just wake up any readers at the front of the queue
12021 +void rt_downgrade_write(struct rw_semaphore *rwsem)
12023 + BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
12024 + rwsem->read_depth = 1;
12026 +EXPORT_SYMBOL(rt_downgrade_write);
12028 +int rt_down_write_trylock(struct rw_semaphore *rwsem)
12030 + int ret = rt_mutex_trylock(&rwsem->lock);
12033 + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12036 +EXPORT_SYMBOL(rt_down_write_trylock);
12038 +void rt_down_write(struct rw_semaphore *rwsem)
12040 + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12041 + rt_mutex_lock(&rwsem->lock);
12043 +EXPORT_SYMBOL(rt_down_write);
12045 +int rt_down_write_killable(struct rw_semaphore *rwsem)
12049 + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12050 + ret = rt_mutex_lock_killable(&rwsem->lock);
12052 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12055 +EXPORT_SYMBOL(rt_down_write_killable);
12057 +int rt_down_write_killable_nested(struct rw_semaphore *rwsem, int subclass)
12061 + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12062 + ret = rt_mutex_lock_killable(&rwsem->lock);
12064 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12067 +EXPORT_SYMBOL(rt_down_write_killable_nested);
12069 +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
12071 + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12072 + rt_mutex_lock(&rwsem->lock);
12074 +EXPORT_SYMBOL(rt_down_write_nested);
12076 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
12077 + struct lockdep_map *nest)
12079 + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
12080 + rt_mutex_lock(&rwsem->lock);
12082 +EXPORT_SYMBOL(rt_down_write_nested_lock);
12084 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
12086 + struct rt_mutex *lock = &rwsem->lock;
12090 + * recursive read locks succeed when current owns the rwsem,
12091 + * but not when read_depth == 0, which means that the rwsem is
12092 + * write locked.
12093 + */
12094 + if (rt_mutex_owner(lock) != current)
12095 + ret = rt_mutex_trylock(&rwsem->lock);
12096 + else if (!rwsem->read_depth)
12100 + rwsem->read_depth++;
12105 +int rt_down_read_trylock(struct rw_semaphore *rwsem)
12109 + ret = rt__down_read_trylock(rwsem);
12111 + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12115 +EXPORT_SYMBOL(rt_down_read_trylock);
12117 +void rt__down_read(struct rw_semaphore *rwsem)
12119 + struct rt_mutex *lock = &rwsem->lock;
12121 + if (rt_mutex_owner(lock) != current)
12122 + rt_mutex_lock(&rwsem->lock);
12123 + rwsem->read_depth++;
12125 +EXPORT_SYMBOL(rt__down_read);
12127 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
12129 + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
12130 + rt__down_read(rwsem);
12133 +void rt_down_read(struct rw_semaphore *rwsem)
12135 + __rt_down_read(rwsem, 0);
12137 +EXPORT_SYMBOL(rt_down_read);
12139 +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
12141 + __rt_down_read(rwsem, subclass);
12143 +EXPORT_SYMBOL(rt_down_read_nested);
12145 +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
12146 + struct lock_class_key *key)
12148 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12150 + * Make sure we are not reinitializing a held lock:
12152 + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
12153 + lockdep_init_map(&rwsem->dep_map, name, key, 0);
12155 + rwsem->read_depth = 0;
12156 + rwsem->lock.save_state = 0;
12158 +EXPORT_SYMBOL(__rt_rwsem_init);
12161 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
12162 + * @cnt: the atomic which we are to dec
12163 + * @lock: the mutex to return holding if we dec to 0
12165 + * return true and hold lock if we dec to 0, return false otherwise
12167 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
12169 + /* dec if we can't possibly hit 0 */
12170 + if (atomic_add_unless(cnt, -1, 1))
12171 + return 0;
12172 + /* we might hit 0, so take the lock */
12173 + mutex_lock(lock);
12174 + if (!atomic_dec_and_test(cnt)) {
12175 + /* when we actually did the dec, we didn't hit 0 */
12176 + mutex_unlock(lock);
12177 + return 0;
12178 + }
12179 + /* we hit 0, and we hold the lock */
12180 + return 1;
12181 +}
12182 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
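atomic_dec_and_mutex_lock() is the usual building block for "free on last put" teardown: the mutex is taken only on the final decrement. A usage sketch (my_obj, my_obj_destroy and obj_lock are hypothetical names):

#include <linux/atomic.h>
#include <linux/mutex.h>

struct my_obj {
        atomic_t refcount;
};

static DEFINE_MUTEX(obj_lock);

static void my_obj_destroy(struct my_obj *obj);         /* illustrative */

static void my_obj_put(struct my_obj *obj)
{
        /* returns with obj_lock held only when the count reached zero */
        if (!atomic_dec_and_mutex_lock(&obj->refcount, &obj_lock))
                return;

        my_obj_destroy(obj);    /* last reference, torn down under obj_lock */
        mutex_unlock(&obj_lock);
}
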
12183 diff -Nur linux-4.9.6.orig/kernel/locking/rtmutex.c linux-4.9.6/kernel/locking/rtmutex.c
12184 --- linux-4.9.6.orig/kernel/locking/rtmutex.c 2017-01-26 08:25:24.000000000 +0100
12185 +++ linux-4.9.6/kernel/locking/rtmutex.c 2017-01-28 13:59:10.035660185 +0100
12187 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12188 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
12189 * Copyright (C) 2006 Esben Nielsen
12190 + * Adaptive Spinlocks:
12191 + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
12192 + * and Peter Morreale,
12193 + * Adaptive Spinlocks simplification:
12194 + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
12196 * See Documentation/locking/rt-mutex-design.txt for details.
12199 #include <linux/sched/rt.h>
12200 #include <linux/sched/deadline.h>
12201 #include <linux/timer.h>
12202 +#include <linux/ww_mutex.h>
12204 #include "rtmutex_common.h"
12206 @@ -133,6 +139,12 @@
12207 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
12210 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
12212 + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
12213 + waiter != PI_REQUEUE_INPROGRESS;
12217 * We can speed up the acquire/release, if there's no debugging state to be
12219 @@ -414,6 +426,14 @@
12220 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
12223 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
12225 + if (waiter->savestate)
12226 + wake_up_lock_sleeper(waiter->task);
12228 + wake_up_process(waiter->task);
12232 * Max number of times we'll walk the boosting chain:
12234 @@ -421,7 +441,8 @@
12236 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
12238 - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
12239 + return rt_mutex_real_waiter(p->pi_blocked_on) ?
12240 + p->pi_blocked_on->lock : NULL;
12244 @@ -557,7 +578,7 @@
12245 * reached or the state of the chain has changed while we
12246 * dropped the locks.
12249 + if (!rt_mutex_real_waiter(waiter))
12250 goto out_unlock_pi;
12253 @@ -719,13 +740,16 @@
12254 * follow here. This is the end of the chain we are walking.
12256 if (!rt_mutex_owner(lock)) {
12257 + struct rt_mutex_waiter *lock_top_waiter;
12260 * If the requeue [7] above changed the top waiter,
12261 * then we need to wake the new top waiter up to try
12264 - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
12265 - wake_up_process(rt_mutex_top_waiter(lock)->task);
12266 + lock_top_waiter = rt_mutex_top_waiter(lock);
12267 + if (prerequeue_top_waiter != lock_top_waiter)
12268 + rt_mutex_wake_waiter(lock_top_waiter);
12269 raw_spin_unlock_irq(&lock->wait_lock);
12272 @@ -818,6 +842,25 @@
12277 +#define STEAL_NORMAL 0
12278 +#define STEAL_LATERAL 1
12281 + * Note that RT tasks are excluded from lateral-steals to prevent the
12282 + * introduction of an unbounded latency
12284 +static inline int lock_is_stealable(struct task_struct *task,
12285 + struct task_struct *pendowner, int mode)
12287 + if (mode == STEAL_NORMAL || rt_task(task)) {
12288 + if (task->prio >= pendowner->prio)
12290 + } else if (task->prio > pendowner->prio)
12296 * Try to take an rt-mutex
12298 @@ -828,8 +871,9 @@
12299 * @waiter: The waiter that is queued to the lock's wait tree if the
12300 * callsite called task_blocked_on_lock(), otherwise NULL
12302 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12303 - struct rt_mutex_waiter *waiter)
12304 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
12305 + struct task_struct *task,
12306 + struct rt_mutex_waiter *waiter, int mode)
12309 * Before testing whether we can acquire @lock, we set the
12310 @@ -866,8 +910,10 @@
12311 * If waiter is not the highest priority waiter of
12314 - if (waiter != rt_mutex_top_waiter(lock))
12315 + if (waiter != rt_mutex_top_waiter(lock)) {
12316 + /* XXX lock_is_stealable() ? */
12321 * We can acquire the lock. Remove the waiter from the
12322 @@ -885,14 +931,10 @@
12323 * not need to be dequeued.
12325 if (rt_mutex_has_waiters(lock)) {
12327 - * If @task->prio is greater than or equal to
12328 - * the top waiter priority (kernel view),
12331 - if (task->prio >= rt_mutex_top_waiter(lock)->prio)
12333 + struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
12335 + if (task != pown && !lock_is_stealable(task, pown, mode))
12338 * The current top waiter stays enqueued. We
12339 * don't have to change anything in the lock
12340 @@ -941,6 +983,433 @@
12344 +#ifdef CONFIG_PREEMPT_RT_FULL
12346 + * preemptible spin_lock functions:
12348 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
12349 + void (*slowfn)(struct rt_mutex *lock,
12353 + might_sleep_no_state_check();
12355 + if (do_mig_dis)
12356 + migrate_disable();
12358 + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
12359 + rt_mutex_deadlock_account_lock(lock, current);
12360 + else
12361 + slowfn(lock, do_mig_dis);
12364 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
12365 + int (*slowfn)(struct rt_mutex *lock))
12367 + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
12368 + rt_mutex_deadlock_account_unlock(current);
12369 + return 0;
12370 + }
12371 + return slowfn(lock);
12375 + * Note that owner is a speculative pointer and dereferencing relies
12376 + * on rcu_read_lock() and the check against the lock owner.
12378 +static int adaptive_wait(struct rt_mutex *lock,
12379 + struct task_struct *owner)
12385 + if (owner != rt_mutex_owner(lock))
12388 + * Ensure that owner->on_cpu is dereferenced _after_
12389 + * checking the above to be valid.
12392 + if (!owner->on_cpu) {
12398 + rcu_read_unlock();
12402 +static int adaptive_wait(struct rt_mutex *lock,
12403 + struct task_struct *orig_owner)
12409 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
12410 + struct rt_mutex_waiter *waiter,
12411 + struct task_struct *task,
12412 + enum rtmutex_chainwalk chwalk);
12414 + * Slow path lock function spin_lock style: this variant is very
12415 + * careful not to miss any non-lock wakeups.
12417 + * We store the current state under p->pi_lock in p->saved_state and
12418 + * the try_to_wake_up() code handles this accordingly.
12420 +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
12423 + struct task_struct *lock_owner, *self = current;
12424 + struct rt_mutex_waiter waiter, *top_waiter;
12425 + unsigned long flags;
12428 + rt_mutex_init_waiter(&waiter, true);
12430 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12432 + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
12433 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12437 + BUG_ON(rt_mutex_owner(lock) == self);
12440 + * We save whatever state the task is in and we'll restore it
12441 + * after acquiring the lock taking real wakeups into account
12442 + * as well. We are serialized via pi_lock against wakeups. See
12443 + * try_to_wake_up().
12445 + raw_spin_lock(&self->pi_lock);
12446 + self->saved_state = self->state;
12447 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12448 + raw_spin_unlock(&self->pi_lock);
12450 + ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
12454 + /* Try to acquire the lock again. */
12455 + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
12458 + top_waiter = rt_mutex_top_waiter(lock);
12459 + lock_owner = rt_mutex_owner(lock);
12461 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12463 + debug_rt_mutex_print_deadlock(&waiter);
12465 + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
12467 + migrate_enable();
12470 + migrate_disable();
12473 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12475 + raw_spin_lock(&self->pi_lock);
12476 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12477 + raw_spin_unlock(&self->pi_lock);
12481 + * Restore the task state to current->saved_state. We set it
12482 + * to the original state above and the try_to_wake_up() code
12483 + * has possibly updated it when a real (non-rtmutex) wakeup
12484 + * happened while we were blocked. Clear saved_state so
12485 + * try_to_wake_up() does not get confused.
12487 + raw_spin_lock(&self->pi_lock);
12488 + __set_current_state_no_track(self->saved_state);
12489 + self->saved_state = TASK_RUNNING;
12490 + raw_spin_unlock(&self->pi_lock);
12493 + * try_to_take_rt_mutex() sets the waiter bit
12494 + * unconditionally. We might have to fix that up:
12496 + fixup_rt_mutex_waiters(lock);
12498 + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
12499 + BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
12501 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12503 + debug_rt_mutex_free_waiter(&waiter);
12506 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
12507 + struct wake_q_head *wake_sleeper_q,
12508 + struct rt_mutex *lock);
12510 + * Slow path to release a rt_mutex spin_lock style
12512 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
12514 + unsigned long flags;
12516 + WAKE_Q(wake_sleeper_q);
12518 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12520 + debug_rt_mutex_unlock(lock);
12522 + rt_mutex_deadlock_account_unlock(current);
12524 + if (!rt_mutex_has_waiters(lock)) {
12525 + lock->owner = NULL;
12526 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12530 + mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12532 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12533 + wake_up_q(&wake_q);
12534 + wake_up_q_sleeper(&wake_sleeper_q);
12536 + /* Undo pi boosting when necessary */
12537 + rt_mutex_adjust_prio(current);
12541 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
12543 + unsigned long flags;
12545 + WAKE_Q(wake_sleeper_q);
12547 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12549 + debug_rt_mutex_unlock(lock);
12551 + rt_mutex_deadlock_account_unlock(current);
12553 + if (!rt_mutex_has_waiters(lock)) {
12554 + lock->owner = NULL;
12555 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12559 + mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12561 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12562 + wake_up_q(&wake_q);
12563 + wake_up_q_sleeper(&wake_sleeper_q);
12567 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
12569 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
12570 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12572 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
12574 +void __lockfunc rt_spin_lock(spinlock_t *lock)
12576 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12577 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12579 +EXPORT_SYMBOL(rt_spin_lock);
12581 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
12583 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
12585 +EXPORT_SYMBOL(__rt_spin_lock);
12587 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
12589 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
12591 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
12593 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12594 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
12596 + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
12597 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12599 +EXPORT_SYMBOL(rt_spin_lock_nested);
12602 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
12604 + /* NOTE: we always pass in '1' for nested, for simplicity */
12605 + spin_release(&lock->dep_map, 1, _RET_IP_);
12606 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12608 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
12610 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
12612 + /* NOTE: we always pass in '1' for nested, for simplicity */
12613 + spin_release(&lock->dep_map, 1, _RET_IP_);
12614 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12615 + migrate_enable();
12617 +EXPORT_SYMBOL(rt_spin_unlock);
12619 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
12623 + /* NOTE: we always pass in '1' for nested, for simplicity */
12624 + spin_release(&lock->dep_map, 1, _RET_IP_);
12625 + ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
12626 + migrate_enable();
12630 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
12632 + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
12634 +EXPORT_SYMBOL(__rt_spin_unlock);
12637 + * Wait for the lock to get unlocked: instead of polling for an unlock
12638 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
12639 + * schedule if there's contention:
12641 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
12643 + spin_lock(lock);
12644 + spin_unlock(lock);
12646 +EXPORT_SYMBOL(rt_spin_unlock_wait);
12648 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
12652 + ret = rt_mutex_trylock(&lock->lock);
12654 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12657 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
12659 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
12663 + migrate_disable();
12664 + ret = rt_mutex_trylock(&lock->lock);
12666 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12668 + migrate_enable();
12671 +EXPORT_SYMBOL(rt_spin_trylock);
12673 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
12677 + local_bh_disable();
12678 + ret = rt_mutex_trylock(&lock->lock);
12680 + migrate_disable();
12681 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12683 + local_bh_enable();
12686 +EXPORT_SYMBOL(rt_spin_trylock_bh);
12688 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
12693 + ret = rt_mutex_trylock(&lock->lock);
12695 + migrate_disable();
12696 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12700 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
12702 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
12704 + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
12705 + if (atomic_add_unless(atomic, -1, 1))
12706 + return 0;
12707 + rt_spin_lock(lock);
12708 + if (atomic_dec_and_test(atomic))
12709 + return 1;
12710 + rt_spin_unlock(lock);
12711 + return 0;
12712 +}
12713 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
12716 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
12718 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12720 + * Make sure we are not reinitializing a held lock:
12722 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
12723 + lockdep_init_map(&lock->dep_map, name, key, 0);
12726 +EXPORT_SYMBOL(__rt_spin_lock_init);
12728 +#endif /* PREEMPT_RT_FULL */
12730 +#ifdef CONFIG_PREEMPT_RT_FULL
12731 + static inline int __sched
12732 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
12734 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
12735 + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
12740 + if (unlikely(ctx == hold_ctx))
12741 + return -EALREADY;
12743 + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
12744 + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
12745 +#ifdef CONFIG_DEBUG_MUTEXES
12746 + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
12747 + ctx->contending_lock = ww;
12755 + static inline int __sched
12756 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
12765 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12766 + struct rt_mutex_waiter *waiter)
12768 + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
12772 * Task blocks on lock.
12774 @@ -971,6 +1440,23 @@
12777 raw_spin_lock(&task->pi_lock);
12780 + * In the case of futex requeue PI, this will be a proxy
12781 + * lock. The task will wake unaware that it is enqueued on
12782 + * this lock. Avoid blocking on two locks and corrupting
12783 + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
12784 + * flag. futex_wait_requeue_pi() sets this when it wakes up
12785 + * before requeue (due to a signal or timeout). Do not enqueue
12786 + * the task if PI_WAKEUP_INPROGRESS is set.
12788 + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
12789 + raw_spin_unlock(&task->pi_lock);
12793 + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
12795 __rt_mutex_adjust_prio(task);
12796 waiter->task = task;
12797 waiter->lock = lock;
12798 @@ -994,7 +1480,7 @@
12799 rt_mutex_enqueue_pi(owner, waiter);
12801 __rt_mutex_adjust_prio(owner);
12802 - if (owner->pi_blocked_on)
12803 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
12805 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
12807 @@ -1036,6 +1522,7 @@
12808 * Called with lock->wait_lock held and interrupts disabled.
12810 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
12811 + struct wake_q_head *wake_sleeper_q,
12812 struct rt_mutex *lock)
12814 struct rt_mutex_waiter *waiter;
12815 @@ -1064,7 +1551,10 @@
12817 raw_spin_unlock(&current->pi_lock);
12819 - wake_q_add(wake_q, waiter->task);
12820 + if (waiter->savestate)
12821 + wake_q_add(wake_sleeper_q, waiter->task);
12823 + wake_q_add(wake_q, waiter->task);
12827 @@ -1078,7 +1568,7 @@
12829 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
12830 struct task_struct *owner = rt_mutex_owner(lock);
12831 - struct rt_mutex *next_lock;
12832 + struct rt_mutex *next_lock = NULL;
12834 raw_spin_lock(&current->pi_lock);
12835 rt_mutex_dequeue(lock, waiter);
12836 @@ -1102,7 +1592,8 @@
12837 __rt_mutex_adjust_prio(owner);
12839 /* Store the lock on which owner is blocked or NULL */
12840 - next_lock = task_blocked_on_lock(owner);
12841 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
12842 + next_lock = task_blocked_on_lock(owner);
12844 raw_spin_unlock(&owner->pi_lock);
12846 @@ -1138,17 +1629,17 @@
12847 raw_spin_lock_irqsave(&task->pi_lock, flags);
12849 waiter = task->pi_blocked_on;
12850 - if (!waiter || (waiter->prio == task->prio &&
12851 + if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
12852 !dl_prio(task->prio))) {
12853 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12856 next_lock = waiter->lock;
12857 - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12859 /* gets dropped in rt_mutex_adjust_prio_chain()! */
12860 get_task_struct(task);
12862 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12863 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
12864 next_lock, NULL, task);
12866 @@ -1166,7 +1657,8 @@
12868 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
12869 struct hrtimer_sleeper *timeout,
12870 - struct rt_mutex_waiter *waiter)
12871 + struct rt_mutex_waiter *waiter,
12872 + struct ww_acquire_ctx *ww_ctx)
12876 @@ -1189,6 +1681,12 @@
12880 + if (ww_ctx && ww_ctx->acquired > 0) {
12881 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
12886 raw_spin_unlock_irq(&lock->wait_lock);
12888 debug_rt_mutex_print_deadlock(waiter);
12889 @@ -1223,21 +1721,96 @@
12893 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
12894 + struct ww_acquire_ctx *ww_ctx)
12896 +#ifdef CONFIG_DEBUG_MUTEXES
12898 + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
12899 + * but released with a normal mutex_unlock in this call.
12901 + * This should never happen, always use ww_mutex_unlock.
12903 + DEBUG_LOCKS_WARN_ON(ww->ctx);
12906 + * Not quite done after calling ww_acquire_done() ?
12908 + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
12910 + if (ww_ctx->contending_lock) {
12912 + * After -EDEADLK you tried to
12913 + * acquire a different ww_mutex? Bad!
12915 + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
12918 + * You called ww_mutex_lock after receiving -EDEADLK,
12919 + * but 'forgot' to unlock everything else first?
12921 + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
12922 + ww_ctx->contending_lock = NULL;
12926 + * Naughty, using a different class will lead to undefined behavior!
12928 + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
12930 + ww_ctx->acquired++;
12933 +#ifdef CONFIG_PREEMPT_RT_FULL
12934 +static void ww_mutex_account_lock(struct rt_mutex *lock,
12935 + struct ww_acquire_ctx *ww_ctx)
12937 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
12938 + struct rt_mutex_waiter *waiter, *n;
12941 + * This branch gets optimized out for the common case,
12942 + * and is only important for ww_mutex_lock.
12944 + ww_mutex_lock_acquired(ww, ww_ctx);
12945 + ww->ctx = ww_ctx;
12948 + * Give any possible sleeping processes the chance to wake up,
12949 + * so they can recheck if they have to back off.
12951 + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
12953 + /* XXX debug rt mutex waiter wakeup */
12955 + BUG_ON(waiter->lock != lock);
12956 + rt_mutex_wake_waiter(waiter);
12962 +static void ww_mutex_account_lock(struct rt_mutex *lock,
12963 + struct ww_acquire_ctx *ww_ctx)
12970 * Slow path lock function:
12973 rt_mutex_slowlock(struct rt_mutex *lock, int state,
12974 struct hrtimer_sleeper *timeout,
12975 - enum rtmutex_chainwalk chwalk)
12976 + enum rtmutex_chainwalk chwalk,
12977 + struct ww_acquire_ctx *ww_ctx)
12979 struct rt_mutex_waiter waiter;
12980 unsigned long flags;
12983 - debug_rt_mutex_init_waiter(&waiter);
12984 - RB_CLEAR_NODE(&waiter.pi_tree_entry);
12985 - RB_CLEAR_NODE(&waiter.tree_entry);
12986 + rt_mutex_init_waiter(&waiter, false);
12989 * Technically we could use raw_spin_[un]lock_irq() here, but this can
12990 @@ -1251,6 +1824,8 @@
12992 /* Try to acquire the lock again: */
12993 if (try_to_take_rt_mutex(lock, current, NULL)) {
12995 + ww_mutex_account_lock(lock, ww_ctx);
12996 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12999 @@ -1265,13 +1840,23 @@
13002 /* sleep on the mutex */
13003 - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
13004 + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
13006 + else if (ww_ctx) {
13007 + /* ww_mutex received EDEADLK, let it become EALREADY */
13008 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
13012 if (unlikely(ret)) {
13013 __set_current_state(TASK_RUNNING);
13014 if (rt_mutex_has_waiters(lock))
13015 remove_waiter(lock, &waiter);
13016 - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13017 + /* ww_mutex wants to report -EDEADLK/-EALREADY, let it */
13019 + rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13020 + } else if (ww_ctx) {
13021 + ww_mutex_account_lock(lock, ww_ctx);
13025 @@ -1331,7 +1916,8 @@
13026 * Return whether the current task needs to undo a potential priority boosting.
13028 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13029 - struct wake_q_head *wake_q)
13030 + struct wake_q_head *wake_q,
13031 + struct wake_q_head *wake_sleeper_q)
13033 unsigned long flags;
13035 @@ -1387,7 +1973,7 @@
13037 * Queue the next waiter for wakeup once we release the wait_lock.
13039 - mark_wakeup_next_waiter(wake_q, lock);
13040 + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
13042 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13044 @@ -1403,31 +1989,36 @@
13047 rt_mutex_fastlock(struct rt_mutex *lock, int state,
13048 + struct ww_acquire_ctx *ww_ctx,
13049 int (*slowfn)(struct rt_mutex *lock, int state,
13050 struct hrtimer_sleeper *timeout,
13051 - enum rtmutex_chainwalk chwalk))
13052 + enum rtmutex_chainwalk chwalk,
13053 + struct ww_acquire_ctx *ww_ctx))
13055 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13056 rt_mutex_deadlock_account_lock(lock, current);
13059 - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
13060 + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
13065 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
13066 struct hrtimer_sleeper *timeout,
13067 enum rtmutex_chainwalk chwalk,
13068 + struct ww_acquire_ctx *ww_ctx,
13069 int (*slowfn)(struct rt_mutex *lock, int state,
13070 struct hrtimer_sleeper *timeout,
13071 - enum rtmutex_chainwalk chwalk))
13072 + enum rtmutex_chainwalk chwalk,
13073 + struct ww_acquire_ctx *ww_ctx))
13075 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
13076 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13077 rt_mutex_deadlock_account_lock(lock, current);
13080 - return slowfn(lock, state, timeout, chwalk);
13081 + return slowfn(lock, state, timeout, chwalk, ww_ctx);
13085 @@ -1444,17 +2035,20 @@
13087 rt_mutex_fastunlock(struct rt_mutex *lock,
13088 bool (*slowfn)(struct rt_mutex *lock,
13089 - struct wake_q_head *wqh))
13090 + struct wake_q_head *wqh,
13091 + struct wake_q_head *wq_sleeper))
13094 + WAKE_Q(wake_sleeper_q);
13096 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13097 rt_mutex_deadlock_account_unlock(current);
13100 - bool deboost = slowfn(lock, &wake_q);
13101 + bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
13103 wake_up_q(&wake_q);
13104 + wake_up_q_sleeper(&wake_sleeper_q);
13106 /* Undo pi boosting if necessary: */
13108 @@ -1471,7 +2065,7 @@
13112 - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
13113 + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
13115 EXPORT_SYMBOL_GPL(rt_mutex_lock);
13117 @@ -1488,7 +2082,7 @@
13121 - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
13122 + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
13124 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
13126 @@ -1501,11 +2095,30 @@
13129 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13130 - RT_MUTEX_FULL_CHAINWALK,
13131 + RT_MUTEX_FULL_CHAINWALK, NULL,
13132 rt_mutex_slowlock);
13136 + * rt_mutex_lock_killable - lock a rt_mutex killable
13138 + * @lock: the rt_mutex to be locked
13140 + * Returns:
13141 + * 0 on success
13143 + * -EINTR when interrupted by a fatal signal
13146 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
13150 + return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
13152 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
13155 * rt_mutex_timed_lock - lock a rt_mutex interruptible
13156 * the timeout structure is provided
13158 @@ -1525,6 +2138,7 @@
13160 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13161 RT_MUTEX_MIN_CHAINWALK,
13163 rt_mutex_slowlock);
13165 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13166 @@ -1542,7 +2156,11 @@
13168 int __sched rt_mutex_trylock(struct rt_mutex *lock)
13170 +#ifdef CONFIG_PREEMPT_RT_FULL
13171 + if (WARN_ON_ONCE(in_irq() || in_nmi()))
13172 +#else
13173 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
13174 +#endif
13175 return 0;
13177 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
13178 @@ -1568,13 +2186,14 @@
13181 bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
13182 - struct wake_q_head *wqh)
13183 + struct wake_q_head *wqh,
13184 + struct wake_q_head *wq_sleeper)
13186 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13187 rt_mutex_deadlock_account_unlock(current);
13190 - return rt_mutex_slowunlock(lock, wqh);
13191 + return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
13195 @@ -1607,13 +2226,12 @@
13196 void __rt_mutex_init(struct rt_mutex *lock, const char *name)
13198 lock->owner = NULL;
13199 - raw_spin_lock_init(&lock->wait_lock);
13200 lock->waiters = RB_ROOT;
13201 lock->waiters_leftmost = NULL;
13203 debug_rt_mutex_init(lock, name);
13205 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
13206 +EXPORT_SYMBOL(__rt_mutex_init);
13209 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
13210 @@ -1628,7 +2246,7 @@
13211 void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13212 struct task_struct *proxy_owner)
13214 - __rt_mutex_init(lock, NULL);
13215 + rt_mutex_init(lock);
13216 debug_rt_mutex_proxy_lock(lock, proxy_owner);
13217 rt_mutex_set_owner(lock, proxy_owner);
13218 rt_mutex_deadlock_account_lock(lock, proxy_owner);
13219 @@ -1676,6 +2294,35 @@
13223 +#ifdef CONFIG_PREEMPT_RT_FULL
13225 + * In PREEMPT_RT there's an added race.
13226 + * If the task, that we are about to requeue, times out,
13227 + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
13228 + * to skip this task. But right after the task sets
13229 + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
13230 + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
13231 + * This will replace the PI_WAKEUP_INPROGRESS with the actual
13232 + * lock that it blocks on. We *must not* place this task
13233 + * on this proxy lock in that case.
13235 + * To prevent this race, we first take the task's pi_lock
13236 + * and check if it has updated its pi_blocked_on. If it has,
13237 + * we assume that it woke up and we return -EAGAIN.
13238 + * Otherwise, we set the task's pi_blocked_on to
13239 + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
13240 + * it will know that we are in the process of requeuing it.
13242 + raw_spin_lock(&task->pi_lock);
13243 + if (task->pi_blocked_on) {
13244 + raw_spin_unlock(&task->pi_lock);
13245 + raw_spin_unlock_irq(&lock->wait_lock);
13248 + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
13249 + raw_spin_unlock(&task->pi_lock);
13252 /* We enforce deadlock detection for futexes */
13253 ret = task_blocks_on_rt_mutex(lock, waiter, task,
13254 RT_MUTEX_FULL_CHAINWALK);
13255 @@ -1690,7 +2337,7 @@
13259 - if (unlikely(ret))
13260 + if (ret && rt_mutex_has_waiters(lock))
13261 remove_waiter(lock, waiter);
13263 raw_spin_unlock_irq(&lock->wait_lock);
13264 @@ -1746,7 +2393,7 @@
13265 set_current_state(TASK_INTERRUPTIBLE);
13267 /* sleep on the mutex */
13268 - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
13269 + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
13272 remove_waiter(lock, waiter);
13273 @@ -1761,3 +2408,89 @@
13279 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
13281 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
13284 + if (ctx->deadlock_inject_countdown-- == 0) {
13285 + tmp = ctx->deadlock_inject_interval;
13286 + if (tmp > UINT_MAX/4)
13289 + tmp = tmp*2 + tmp + tmp/2;
13291 + ctx->deadlock_inject_interval = tmp;
13292 + ctx->deadlock_inject_countdown = tmp;
13293 + ctx->contending_lock = lock;
13295 + ww_mutex_unlock(lock);
13304 +#ifdef CONFIG_PREEMPT_RT_FULL
13306 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13312 + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13313 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
13315 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13316 + else if (!ret && ww_ctx->acquired > 1)
13317 + return ww_mutex_deadlock_injection(lock, ww_ctx);
13321 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
13324 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13330 + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13331 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
13333 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13334 + else if (!ret && ww_ctx->acquired > 1)
13335 + return ww_mutex_deadlock_injection(lock, ww_ctx);
13339 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
13341 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
13343 + int nest = !!lock->ctx;
13346 + * The unlocking fastpath is the 0->1 transition from 'locked'
13347 + * into 'unlocked' state:
13350 +#ifdef CONFIG_DEBUG_MUTEXES
13351 + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
13353 + if (lock->ctx->acquired > 0)
13354 + lock->ctx->acquired--;
13355 + lock->ctx = NULL;
13358 + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
13359 + rt_mutex_unlock(&lock->base.lock);
13361 +EXPORT_SYMBOL(ww_mutex_unlock);
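The RT implementation above preserves the wound/wait contract: -EALREADY when a context relocks a mutex it already holds, -EDEADLK when a younger context must back off. The standard caller pattern, sketched for two locks (demo_ww_class and lock_pair are illustrative names, not part of this patch):

#include <linux/kernel.h>
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
{
        struct ww_acquire_ctx ctx;

        ww_acquire_init(&ctx, &demo_ww_class);
        ww_mutex_lock(a, &ctx); /* first lock in a fresh ctx cannot deadlock */
        while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
                /* wounded by an older context: back off, then sleep on b */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, &ctx);
                swap(a, b);     /* we now hold what used to be b */
        }
        ww_acquire_done(&ctx);
        /* ... critical section; unlock both, then ww_acquire_fini(&ctx) */
}

On PREEMPT_RT_FULL the same contract is honoured by rt_mutex_slowlock() via __mutex_lock_check_stamp(), so callers written against the mainline ww_mutex API need no changes.
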
13363 diff -Nur linux-4.9.6.orig/kernel/locking/rtmutex_common.h linux-4.9.6/kernel/locking/rtmutex_common.h
13364 --- linux-4.9.6.orig/kernel/locking/rtmutex_common.h 2017-01-26 08:25:24.000000000 +0100
13365 +++ linux-4.9.6/kernel/locking/rtmutex_common.h 2017-01-28 13:59:10.035660185 +0100
13367 struct rb_node pi_tree_entry;
13368 struct task_struct *task;
13369 struct rt_mutex *lock;
13371 #ifdef CONFIG_DEBUG_RT_MUTEXES
13373 struct pid *deadlock_task_pid;
13376 * PI-futex support (proxy locking functions, etc.):
13378 +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
13379 +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
13381 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
13382 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13383 struct task_struct *proxy_owner);
13384 @@ -111,7 +115,8 @@
13385 struct rt_mutex_waiter *waiter);
13386 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
13387 extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
13388 - struct wake_q_head *wqh);
13389 + struct wake_q_head *wqh,
13390 + struct wake_q_head *wq_sleeper);
13391 extern void rt_mutex_adjust_prio(struct task_struct *task);
13393 #ifdef CONFIG_DEBUG_RT_MUTEXES
13394 @@ -120,4 +125,14 @@
13395 # include "rtmutex.h"
13398 +static inline void
13399 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
13401 + debug_rt_mutex_init_waiter(waiter);
13402 + waiter->task = NULL;
13403 + waiter->savestate = savestate;
13404 + RB_CLEAR_NODE(&waiter->pi_tree_entry);
13405 + RB_CLEAR_NODE(&waiter->tree_entry);
13409 diff -Nur linux-4.9.6.orig/kernel/locking/spinlock.c linux-4.9.6/kernel/locking/spinlock.c
13410 --- linux-4.9.6.orig/kernel/locking/spinlock.c 2017-01-26 08:25:24.000000000 +0100
13411 +++ linux-4.9.6/kernel/locking/spinlock.c 2017-01-28 13:59:10.035660185 +0100
13412 @@ -124,8 +124,11 @@
13413 * __[spin|read|write]_lock_bh()
13415 BUILD_LOCK_OPS(spin, raw_spinlock);
13417 +#ifndef CONFIG_PREEMPT_RT_FULL
13418 BUILD_LOCK_OPS(read, rwlock);
13419 BUILD_LOCK_OPS(write, rwlock);
13424 @@ -209,6 +212,8 @@
13425 EXPORT_SYMBOL(_raw_spin_unlock_bh);
13428 +#ifndef CONFIG_PREEMPT_RT_FULL
13430 #ifndef CONFIG_INLINE_READ_TRYLOCK
13431 int __lockfunc _raw_read_trylock(rwlock_t *lock)
13433 @@ -353,6 +358,8 @@
13434 EXPORT_SYMBOL(_raw_write_unlock_bh);
13437 +#endif /* !PREEMPT_RT_FULL */
13439 #ifdef CONFIG_DEBUG_LOCK_ALLOC
13441 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
13442 diff -Nur linux-4.9.6.orig/kernel/locking/spinlock_debug.c linux-4.9.6/kernel/locking/spinlock_debug.c
13443 --- linux-4.9.6.orig/kernel/locking/spinlock_debug.c 2017-01-26 08:25:24.000000000 +0100
13444 +++ linux-4.9.6/kernel/locking/spinlock_debug.c 2017-01-28 13:59:10.035660185 +0100
13447 EXPORT_SYMBOL(__raw_spin_lock_init);
13449 +#ifndef CONFIG_PREEMPT_RT_FULL
13450 void __rwlock_init(rwlock_t *lock, const char *name,
13451 struct lock_class_key *key)
13456 EXPORT_SYMBOL(__rwlock_init);
13459 static void spin_dump(raw_spinlock_t *lock, const char *msg)
13461 @@ -159,6 +161,7 @@
13462 arch_spin_unlock(&lock->raw_lock);
13465 +#ifndef CONFIG_PREEMPT_RT_FULL
13466 static void rwlock_bug(rwlock_t *lock, const char *msg)
13468 if (!debug_locks_off())
13469 @@ -300,3 +303,5 @@
13470 debug_write_unlock(lock);
13471 arch_write_unlock(&lock->raw_lock);
13475 diff -Nur linux-4.9.6.orig/kernel/panic.c linux-4.9.6/kernel/panic.c
13476 --- linux-4.9.6.orig/kernel/panic.c 2017-01-26 08:25:24.000000000 +0100
13477 +++ linux-4.9.6/kernel/panic.c 2017-01-28 13:59:10.035660185 +0100
13478 @@ -482,9 +482,11 @@
13480 static int init_oops_id(void)
13482 +#ifndef CONFIG_PREEMPT_RT_FULL
13484 get_random_bytes(&oops_id, sizeof(oops_id));
13490 diff -Nur linux-4.9.6.orig/kernel/power/hibernate.c linux-4.9.6/kernel/power/hibernate.c
13491 --- linux-4.9.6.orig/kernel/power/hibernate.c 2017-01-26 08:25:24.000000000 +0100
13492 +++ linux-4.9.6/kernel/power/hibernate.c 2017-01-28 13:59:10.039660339 +0100
13493 @@ -286,6 +286,8 @@
13495 local_irq_disable();
13497 + system_state = SYSTEM_SUSPEND;
13499 error = syscore_suspend();
13501 printk(KERN_ERR "PM: Some system devices failed to power down, "
13502 @@ -317,6 +319,7 @@
13506 + system_state = SYSTEM_RUNNING;
13507 local_irq_enable();
13510 @@ -446,6 +449,7 @@
13513 local_irq_disable();
13514 + system_state = SYSTEM_SUSPEND;
13516 error = syscore_suspend();
13518 @@ -479,6 +483,7 @@
13522 + system_state = SYSTEM_RUNNING;
13523 local_irq_enable();
13526 @@ -564,6 +569,7 @@
13529 local_irq_disable();
13530 + system_state = SYSTEM_SUSPEND;
13532 if (pm_wakeup_pending()) {
13534 @@ -576,6 +582,7 @@
13538 + system_state = SYSTEM_RUNNING;
13539 local_irq_enable();
13542 @@ -676,6 +683,10 @@
13546 +#ifndef CONFIG_SUSPEND
13547 +bool pm_in_action;
13551 * hibernate - Carry out system hibernation, including saving the image.
13553 @@ -689,6 +700,8 @@
13557 + pm_in_action = true;
13559 lock_system_sleep();
13560 /* The snapshot device should not be opened while we're running */
13561 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
13562 @@ -766,6 +779,7 @@
13563 atomic_inc(&snapshot_device_available);
13565 unlock_system_sleep();
13566 + pm_in_action = false;
13570 diff -Nur linux-4.9.6.orig/kernel/power/suspend.c linux-4.9.6/kernel/power/suspend.c
13571 --- linux-4.9.6.orig/kernel/power/suspend.c 2017-01-26 08:25:24.000000000 +0100
13572 +++ linux-4.9.6/kernel/power/suspend.c 2017-01-28 13:59:10.039660339 +0100
13573 @@ -369,6 +369,8 @@
13574 arch_suspend_disable_irqs();
13575 BUG_ON(!irqs_disabled());
13577 + system_state = SYSTEM_SUSPEND;
13579 error = syscore_suspend();
13581 *wakeup = pm_wakeup_pending();
13582 @@ -385,6 +387,8 @@
13586 + system_state = SYSTEM_RUNNING;
13588 arch_suspend_enable_irqs();
13589 BUG_ON(irqs_disabled());
13591 @@ -527,6 +531,8 @@
13595 +bool pm_in_action;
13598 * pm_suspend - Externally visible function for suspending the system.
13599 * @state: System sleep state to enter.
13600 @@ -541,6 +547,8 @@
13601 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
13604 + pm_in_action = true;
13606 error = enter_state(state);
13608 suspend_stats.fail++;
13609 @@ -548,6 +556,7 @@
13611 suspend_stats.success++;
13613 + pm_in_action = false;
13616 EXPORT_SYMBOL(pm_suspend);
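pm_in_action marks that a suspend or hibernation cycle is in flight. It is defined here (and in hibernate.c when CONFIG_SUSPEND is off) so other code can relax debug checks that would misfire during resume; the swait changes later in this patch use it exactly that way. A minimal sketch of the consumer side (the helper name is illustrative):

	extern bool pm_in_action;

	static void warn_on_mass_wakeup(unsigned int wakes)
	{
		/* resume legitimately completes many waiters at once */
		if (pm_in_action)
			return;
		WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
	}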
13617 diff -Nur linux-4.9.6.orig/kernel/printk/printk.c linux-4.9.6/kernel/printk/printk.c
13618 --- linux-4.9.6.orig/kernel/printk/printk.c 2017-01-26 08:25:24.000000000 +0100
13619 +++ linux-4.9.6/kernel/printk/printk.c 2017-01-28 13:59:10.039660339 +0100
13620 @@ -351,6 +351,65 @@
13622 DEFINE_RAW_SPINLOCK(logbuf_lock);
13624 +#ifdef CONFIG_EARLY_PRINTK
13625 +struct console *early_console;
13627 +static void early_vprintk(const char *fmt, va_list ap)
13629 + if (early_console) {
13631 + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
13633 + early_console->write(early_console, buf, n);
13637 +asmlinkage void early_printk(const char *fmt, ...)
13641 + va_start(ap, fmt);
13642 + early_vprintk(fmt, ap);
13647 + * This is independent of any log levels - a global
13648 + * kill switch that turns off all of printk.
13650 + * Used by the NMI watchdog if early-printk is enabled.
13652 +static bool __read_mostly printk_killswitch;
13654 +static int __init force_early_printk_setup(char *str)
13656 + printk_killswitch = true;
13659 +early_param("force_early_printk", force_early_printk_setup);
13661 +void printk_kill(void)
13663 + printk_killswitch = true;
13666 +#ifdef CONFIG_PRINTK
13667 +static int forced_early_printk(const char *fmt, va_list ap)
13669 + if (!printk_killswitch)
13671 + early_vprintk(fmt, ap);
13677 +static inline int forced_early_printk(const char *fmt, va_list ap)
13683 #ifdef CONFIG_PRINTK
13684 DECLARE_WAIT_QUEUE_HEAD(log_wait);
13685 /* the next printk record to read by syslog(READ) or /proc/kmsg */
13686 @@ -1337,6 +1396,7 @@
13690 + int attempts = 0;
13692 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
13694 @@ -1348,6 +1408,14 @@
13697 enum log_flags prev;
13701 + if (attempts > 10) {
13708 * Find first record that fits, including all following records,
13709 @@ -1363,6 +1431,14 @@
13711 idx = log_next(idx);
13714 + if (num_msg > 5) {
13716 + raw_spin_unlock_irq(&logbuf_lock);
13717 + raw_spin_lock_irq(&logbuf_lock);
13718 + if (clear_seq < log_first_seq)
13723 /* move first record forward until length fits into the buffer */
13724 @@ -1376,6 +1452,14 @@
13726 idx = log_next(idx);
13729 + if (num_msg > 5) {
13731 + raw_spin_unlock_irq(&logbuf_lock);
13732 + raw_spin_lock_irq(&logbuf_lock);
13733 + if (clear_seq < log_first_seq)
13738 /* last message fitting into this dump */
13739 @@ -1416,6 +1500,7 @@
13740 clear_seq = log_next_seq;
13741 clear_idx = log_next_idx;
13744 raw_spin_unlock_irq(&logbuf_lock);
13747 @@ -1569,6 +1654,12 @@
13748 if (!console_drivers)
13751 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
13752 + if (in_irq() || in_nmi())
13756 + migrate_disable();
13757 for_each_console(con) {
13758 if (exclusive_console && con != exclusive_console)
13760 @@ -1584,6 +1675,7 @@
13762 con->write(con, text, len);
13764 + migrate_enable();
13768 @@ -1781,6 +1873,13 @@
13769 /* cpu currently holding logbuf_lock in this function */
13770 static unsigned int logbuf_cpu = UINT_MAX;
13773 + * Fall back to early_printk if a debugging subsystem has
13774 + * killed printk output
13776 + if (unlikely(forced_early_printk(fmt, args)))
13779 if (level == LOGLEVEL_SCHED) {
13780 level = LOGLEVEL_DEFAULT;
13782 @@ -1885,13 +1984,23 @@
13784 /* If called from the scheduler, we can not call up(). */
13786 + int may_trylock = 1;
13789 +#ifdef CONFIG_PREEMPT_RT_FULL
13791 + * we can't take a sleeping lock with IRQs or preemption disabled
13792 + * so we can't print in these contexts
13794 + if (!(preempt_count() == 0 && !irqs_disabled()))
13798 * Try to acquire and then immediately release the console
13799 * semaphore. The release will print out buffers and wake up
13800 * /dev/kmsg and syslog() users.
13802 - if (console_trylock())
13803 + if (may_trylock && console_trylock())
13807 @@ -2014,26 +2123,6 @@
13809 #endif /* CONFIG_PRINTK */
13811 -#ifdef CONFIG_EARLY_PRINTK
13812 -struct console *early_console;
13814 -asmlinkage __visible void early_printk(const char *fmt, ...)
13820 - if (!early_console)
13823 - va_start(ap, fmt);
13824 - n = vscnprintf(buf, sizeof(buf), fmt, ap);
13827 - early_console->write(early_console, buf, n);
13831 static int __add_preferred_console(char *name, int idx, char *options,
13834 @@ -2303,11 +2392,16 @@
13837 len = cont_print_text(text, size);
13838 +#ifdef CONFIG_PREEMPT_RT_FULL
13839 + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13840 + call_console_drivers(cont.level, NULL, 0, text, len);
13842 raw_spin_unlock(&logbuf_lock);
13843 stop_critical_timings();
13844 call_console_drivers(cont.level, NULL, 0, text, len);
13845 start_critical_timings();
13846 local_irq_restore(flags);
13850 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13851 @@ -2431,13 +2525,17 @@
13852 console_idx = log_next(console_idx);
13854 console_prev = msg->flags;
13855 +#ifdef CONFIG_PREEMPT_RT_FULL
13856 + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13857 + call_console_drivers(level, ext_text, ext_len, text, len);
13859 raw_spin_unlock(&logbuf_lock);
13861 stop_critical_timings(); /* don't trace print latency */
13862 call_console_drivers(level, ext_text, ext_len, text, len);
13863 start_critical_timings();
13864 local_irq_restore(flags);
13867 if (do_cond_resched)
13870 @@ -2489,6 +2587,11 @@
13874 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
13875 + if (in_irq() || in_nmi())
13880 * console_unblank can no longer be called in interrupt context unless
13881 * oops_in_progress is set to 1..
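The kill switch gives debugging code a way to bypass printk's locks entirely: once printk_kill() is called, vprintk_emit() diverts every message to the early console via forced_early_printk(), and the same behaviour can be forced from boot with force_early_printk on the command line. A sketch of the intended caller (the watchdog function name is an illustrative assumption):

	/* e.g. in an NMI watchdog about to report a hard lockup */
	static void hard_lockup_report_sketch(void)
	{
		printk_kill();	/* later printk()s bypass logbuf_lock */
		early_printk("NMI watchdog: hard LOCKUP on CPU %d\n",
			     raw_smp_processor_id());
	}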
13882 diff -Nur linux-4.9.6.orig/kernel/ptrace.c linux-4.9.6/kernel/ptrace.c
13883 --- linux-4.9.6.orig/kernel/ptrace.c 2017-01-26 08:25:24.000000000 +0100
13884 +++ linux-4.9.6/kernel/ptrace.c 2017-01-28 13:59:10.039660339 +0100
13885 @@ -166,7 +166,14 @@
13887 spin_lock_irq(&task->sighand->siglock);
13888 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
13889 - task->state = __TASK_TRACED;
13890 + unsigned long flags;
13892 + raw_spin_lock_irqsave(&task->pi_lock, flags);
13893 + if (task->state & __TASK_TRACED)
13894 + task->state = __TASK_TRACED;
13896 + task->saved_state = __TASK_TRACED;
13897 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13900 spin_unlock_irq(&task->sighand->siglock);
13901 diff -Nur linux-4.9.6.orig/kernel/rcu/rcutorture.c linux-4.9.6/kernel/rcu/rcutorture.c
13902 --- linux-4.9.6.orig/kernel/rcu/rcutorture.c 2017-01-26 08:25:24.000000000 +0100
13903 +++ linux-4.9.6/kernel/rcu/rcutorture.c 2017-01-28 13:59:10.039660339 +0100
13904 @@ -404,6 +404,7 @@
13908 +#ifndef CONFIG_PREEMPT_RT_FULL
13910 * Definitions for rcu_bh torture testing.
13912 @@ -443,6 +444,12 @@
13917 +static struct rcu_torture_ops rcu_bh_ops = {
13918 + .ttype = INVALID_RCU_FLAVOR,
13923 * Don't even think about trying any of these in real life!!!
13924 * The names includes "busted", and they really means it!
13925 diff -Nur linux-4.9.6.orig/kernel/rcu/tree.c linux-4.9.6/kernel/rcu/tree.c
13926 --- linux-4.9.6.orig/kernel/rcu/tree.c 2017-01-26 08:25:24.000000000 +0100
13927 +++ linux-4.9.6/kernel/rcu/tree.c 2017-01-28 13:59:10.039660339 +0100
13929 #include <linux/random.h>
13930 #include <linux/trace_events.h>
13931 #include <linux/suspend.h>
13932 +#include <linux/delay.h>
13933 +#include <linux/gfp.h>
13934 +#include <linux/oom.h>
13935 +#include <linux/smpboot.h>
13936 +#include "../time/tick-internal.h"
13940 @@ -260,6 +265,19 @@
13941 this_cpu_ptr(&rcu_sched_data), true);
13944 +#ifdef CONFIG_PREEMPT_RT_FULL
13945 +static void rcu_preempt_qs(void);
13947 +void rcu_bh_qs(void)
13949 + unsigned long flags;
13951 + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
13952 + local_irq_save(flags);
13953 + rcu_preempt_qs();
13954 + local_irq_restore(flags);
13957 void rcu_bh_qs(void)
13959 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
13960 @@ -269,6 +287,7 @@
13961 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
13966 static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
13968 @@ -449,11 +468,13 @@
13970 * Return the number of RCU BH batches started thus far for debug & stats.
13972 +#ifndef CONFIG_PREEMPT_RT_FULL
13973 unsigned long rcu_batches_started_bh(void)
13975 return rcu_bh_state.gpnum;
13977 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
13981 * Return the number of RCU batches completed thus far for debug & stats.
13982 @@ -473,6 +494,7 @@
13984 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
13986 +#ifndef CONFIG_PREEMPT_RT_FULL
13988 * Return the number of RCU BH batches completed thus far for debug & stats.
13990 @@ -481,6 +503,7 @@
13991 return rcu_bh_state.completed;
13993 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
13997 * Return the number of RCU expedited batches completed thus far for
13998 @@ -504,6 +527,7 @@
14000 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
14002 +#ifndef CONFIG_PREEMPT_RT_FULL
14004 * Force a quiescent state.
14006 @@ -522,6 +546,13 @@
14008 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
14011 +void rcu_force_quiescent_state(void)
14014 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
14018 * Force a quiescent state for RCU-sched.
14020 @@ -572,9 +603,11 @@
14024 +#ifndef CONFIG_PREEMPT_RT_FULL
14025 case RCU_BH_FLAVOR:
14026 rsp = &rcu_bh_state;
14029 case RCU_SCHED_FLAVOR:
14030 rsp = &rcu_sched_state;
14032 @@ -3016,18 +3049,17 @@
14034 * Do RCU core processing for the current CPU.
14036 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
14037 +static __latent_entropy void rcu_process_callbacks(void)
14039 struct rcu_state *rsp;
14041 if (cpu_is_offline(smp_processor_id()))
14043 - trace_rcu_utilization(TPS("Start RCU core"));
14044 for_each_rcu_flavor(rsp)
14045 __rcu_process_callbacks(rsp);
14046 - trace_rcu_utilization(TPS("End RCU core"));
14049 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14051 * Schedule RCU callback invocation. If the specified type of RCU
14052 * does not support RCU priority boosting, just do a direct call,
14053 @@ -3039,18 +3071,105 @@
14055 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
14057 - if (likely(!rsp->boost)) {
14058 - rcu_do_batch(rsp, rdp);
14059 + rcu_do_batch(rsp, rdp);
14062 +static void rcu_wake_cond(struct task_struct *t, int status)
14065 + * If the thread is yielding, only wake it when this
14066 + * is invoked from idle
14068 + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
14069 + wake_up_process(t);
14073 + * Wake up this CPU's rcuc kthread to do RCU core processing.
14075 +static void invoke_rcu_core(void)
14077 + unsigned long flags;
14078 + struct task_struct *t;
14080 + if (!cpu_online(smp_processor_id()))
14082 + local_irq_save(flags);
14083 + __this_cpu_write(rcu_cpu_has_work, 1);
14084 + t = __this_cpu_read(rcu_cpu_kthread_task);
14085 + if (t != NULL && current != t)
14086 + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
14087 + local_irq_restore(flags);
14090 +static void rcu_cpu_kthread_park(unsigned int cpu)
14092 + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14095 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
14097 + return __this_cpu_read(rcu_cpu_has_work);
14101 + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
14102 + * RCU softirq used in flavors and configurations of RCU that do not
14103 + * support RCU priority boosting.
14105 +static void rcu_cpu_kthread(unsigned int cpu)
14107 + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14108 + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14111 + for (spincnt = 0; spincnt < 10; spincnt++) {
14112 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14113 + local_bh_disable();
14114 + *statusp = RCU_KTHREAD_RUNNING;
14115 + this_cpu_inc(rcu_cpu_kthread_loops);
14116 + local_irq_disable();
14119 + local_irq_enable();
14121 + rcu_process_callbacks();
14122 + local_bh_enable();
14123 + if (*workp == 0) {
14124 + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14125 + *statusp = RCU_KTHREAD_WAITING;
14129 - invoke_rcu_callbacks_kthread();
14130 + *statusp = RCU_KTHREAD_YIELDING;
14131 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14132 + schedule_timeout_interruptible(2);
14133 + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14134 + *statusp = RCU_KTHREAD_WAITING;
14137 -static void invoke_rcu_core(void)
14138 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14139 + .store = &rcu_cpu_kthread_task,
14140 + .thread_should_run = rcu_cpu_kthread_should_run,
14141 + .thread_fn = rcu_cpu_kthread,
14142 + .thread_comm = "rcuc/%u",
14143 + .setup = rcu_cpu_kthread_setup,
14144 + .park = rcu_cpu_kthread_park,
14148 + * Spawn per-CPU RCU core processing kthreads.
14150 +static int __init rcu_spawn_core_kthreads(void)
14152 - if (cpu_online(smp_processor_id()))
14153 - raise_softirq(RCU_SOFTIRQ);
14156 + for_each_possible_cpu(cpu)
14157 + per_cpu(rcu_cpu_has_work, cpu) = 0;
14158 + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14161 +early_initcall(rcu_spawn_core_kthreads);
14164 * Handle any core-RCU processing required by a call_rcu() invocation.
14165 @@ -3195,6 +3314,7 @@
14167 EXPORT_SYMBOL_GPL(call_rcu_sched);
14169 +#ifndef CONFIG_PREEMPT_RT_FULL
14171 * Queue an RCU callback for invocation after a quicker grace period.
14173 @@ -3203,6 +3323,7 @@
14174 __call_rcu(head, func, &rcu_bh_state, -1, 0);
14176 EXPORT_SYMBOL_GPL(call_rcu_bh);
14180 * Queue an RCU callback for lazy invocation after a grace period.
14181 @@ -3294,6 +3415,7 @@
14183 EXPORT_SYMBOL_GPL(synchronize_sched);
14185 +#ifndef CONFIG_PREEMPT_RT_FULL
14187 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
14189 @@ -3320,6 +3442,7 @@
14190 wait_rcu_gp(call_rcu_bh);
14192 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
14196 * get_state_synchronize_rcu - Snapshot current RCU state
14197 @@ -3698,6 +3821,7 @@
14198 mutex_unlock(&rsp->barrier_mutex);
14201 +#ifndef CONFIG_PREEMPT_RT_FULL
14203 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
14205 @@ -3706,6 +3830,7 @@
14206 _rcu_barrier(&rcu_bh_state);
14208 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
14212 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
14213 @@ -4227,12 +4352,13 @@
14215 rcu_bootup_announce();
14216 rcu_init_geometry();
14217 +#ifndef CONFIG_PREEMPT_RT_FULL
14218 rcu_init_one(&rcu_bh_state);
14220 rcu_init_one(&rcu_sched_state);
14222 rcu_dump_rcu_node_tree(&rcu_sched_state);
14223 __rcu_init_preempt();
14224 - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
14227 * We don't need protection against CPU-hotplug here because
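The rcuc/%u threads above ride on the generic smpboot infrastructure, which creates one kthread per CPU and parks/unparks them across hotplug automatically. Reduced to its skeleton, the registration pattern looks like this (all my_* names are illustrative):

	#include <linux/smpboot.h>

	static DEFINE_PER_CPU(struct task_struct *, my_task);
	static DEFINE_PER_CPU(int, my_has_work);

	static int my_should_run(unsigned int cpu)
	{
		return __this_cpu_read(my_has_work);
	}

	static void my_thread_fn(unsigned int cpu)
	{
		__this_cpu_write(my_has_work, 0);
		/* ... process this CPU's work, may sleep ... */
	}

	static struct smp_hotplug_thread my_thread_spec = {
		.store			= &my_task,
		.thread_should_run	= my_should_run,
		.thread_fn		= my_thread_fn,
		.thread_comm		= "mythread/%u",
	};

	static int __init my_spawn_threads(void)
	{
		return smpboot_register_percpu_thread(&my_thread_spec);
	}
	early_initcall(my_spawn_threads);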
14228 diff -Nur linux-4.9.6.orig/kernel/rcu/tree.h linux-4.9.6/kernel/rcu/tree.h
14229 --- linux-4.9.6.orig/kernel/rcu/tree.h 2017-01-26 08:25:24.000000000 +0100
14230 +++ linux-4.9.6/kernel/rcu/tree.h 2017-01-28 13:59:10.043660493 +0100
14231 @@ -588,18 +588,18 @@
14233 extern struct rcu_state rcu_sched_state;
14235 +#ifndef CONFIG_PREEMPT_RT_FULL
14236 extern struct rcu_state rcu_bh_state;
14239 #ifdef CONFIG_PREEMPT_RCU
14240 extern struct rcu_state rcu_preempt_state;
14241 #endif /* #ifdef CONFIG_PREEMPT_RCU */
14243 -#ifdef CONFIG_RCU_BOOST
14244 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14245 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
14246 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14247 DECLARE_PER_CPU(char, rcu_cpu_has_work);
14248 -#endif /* #ifdef CONFIG_RCU_BOOST */
14250 #ifndef RCU_TREE_NONCORE
14252 @@ -619,10 +619,9 @@
14253 static void __init __rcu_init_preempt(void);
14254 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
14255 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
14256 -static void invoke_rcu_callbacks_kthread(void);
14257 static bool rcu_is_callbacks_kthread(void);
14258 +static void rcu_cpu_kthread_setup(unsigned int cpu);
14259 #ifdef CONFIG_RCU_BOOST
14260 -static void rcu_preempt_do_callbacks(void);
14261 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
14262 struct rcu_node *rnp);
14263 #endif /* #ifdef CONFIG_RCU_BOOST */
14264 diff -Nur linux-4.9.6.orig/kernel/rcu/tree_plugin.h linux-4.9.6/kernel/rcu/tree_plugin.h
14265 --- linux-4.9.6.orig/kernel/rcu/tree_plugin.h 2017-01-26 08:25:24.000000000 +0100
14266 +++ linux-4.9.6/kernel/rcu/tree_plugin.h 2017-01-28 13:59:10.043660493 +0100
14267 @@ -24,25 +24,10 @@
14268 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
14271 -#include <linux/delay.h>
14272 -#include <linux/gfp.h>
14273 -#include <linux/oom.h>
14274 -#include <linux/smpboot.h>
14275 -#include "../time/tick-internal.h"
14277 #ifdef CONFIG_RCU_BOOST
14279 #include "../locking/rtmutex_common.h"
14282 - * Control variables for per-CPU and per-rcu_node kthreads. These
14283 - * handle all flavors of RCU.
14285 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14286 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14287 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14288 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
14290 #else /* #ifdef CONFIG_RCU_BOOST */
14295 #endif /* #else #ifdef CONFIG_RCU_BOOST */
14298 + * Control variables for per-CPU and per-rcu_node kthreads. These
14299 + * handle all flavors of RCU.
14301 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14302 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14303 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
14305 #ifdef CONFIG_RCU_NOCB_CPU
14306 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
14307 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
14308 @@ -426,7 +419,7 @@
14311 /* Hardware IRQ handlers cannot block, complain if they get here. */
14312 - if (in_irq() || in_serving_softirq()) {
14313 + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
14314 lockdep_rcu_suspicious(__FILE__, __LINE__,
14315 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
14316 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
14317 @@ -632,15 +625,6 @@
14318 t->rcu_read_unlock_special.b.need_qs = true;
14321 -#ifdef CONFIG_RCU_BOOST
14323 -static void rcu_preempt_do_callbacks(void)
14325 - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
14328 -#endif /* #ifdef CONFIG_RCU_BOOST */
14331 * Queue a preemptible-RCU callback for invocation after a grace period.
14333 @@ -829,6 +813,19 @@
14335 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14338 + * If boosting, set rcuc kthreads to realtime priority.
14340 +static void rcu_cpu_kthread_setup(unsigned int cpu)
14342 +#ifdef CONFIG_RCU_BOOST
14343 + struct sched_param sp;
14345 + sp.sched_priority = kthread_prio;
14346 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14347 +#endif /* #ifdef CONFIG_RCU_BOOST */
14350 #ifdef CONFIG_RCU_BOOST
14352 #include "../locking/rtmutex_common.h"
14353 @@ -860,16 +857,6 @@
14355 #endif /* #else #ifdef CONFIG_RCU_TRACE */
14357 -static void rcu_wake_cond(struct task_struct *t, int status)
14360 - * If the thread is yielding, only wake it when this
14361 - * is invoked from idle
14363 - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
14364 - wake_up_process(t);
14368 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
14369 * or ->boost_tasks, advancing the pointer to the next task in the
14370 @@ -1013,23 +1000,6 @@
14374 - * Wake up the per-CPU kthread to invoke RCU callbacks.
14376 -static void invoke_rcu_callbacks_kthread(void)
14378 - unsigned long flags;
14380 - local_irq_save(flags);
14381 - __this_cpu_write(rcu_cpu_has_work, 1);
14382 - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
14383 - current != __this_cpu_read(rcu_cpu_kthread_task)) {
14384 - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
14385 - __this_cpu_read(rcu_cpu_kthread_status));
14387 - local_irq_restore(flags);
14391 * Is the current CPU running the RCU-callbacks kthread?
14392 * Caller must have preemption disabled.
14394 @@ -1083,67 +1053,6 @@
14398 -static void rcu_kthread_do_work(void)
14400 - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
14401 - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
14402 - rcu_preempt_do_callbacks();
14405 -static void rcu_cpu_kthread_setup(unsigned int cpu)
14407 - struct sched_param sp;
14409 - sp.sched_priority = kthread_prio;
14410 - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14413 -static void rcu_cpu_kthread_park(unsigned int cpu)
14415 - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14418 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
14420 - return __this_cpu_read(rcu_cpu_has_work);
14424 - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
14425 - * RCU softirq used in flavors and configurations of RCU that do not
14426 - * support RCU priority boosting.
14428 -static void rcu_cpu_kthread(unsigned int cpu)
14430 - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14431 - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14434 - for (spincnt = 0; spincnt < 10; spincnt++) {
14435 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14436 - local_bh_disable();
14437 - *statusp = RCU_KTHREAD_RUNNING;
14438 - this_cpu_inc(rcu_cpu_kthread_loops);
14439 - local_irq_disable();
14442 - local_irq_enable();
14444 - rcu_kthread_do_work();
14445 - local_bh_enable();
14446 - if (*workp == 0) {
14447 - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14448 - *statusp = RCU_KTHREAD_WAITING;
14452 - *statusp = RCU_KTHREAD_YIELDING;
14453 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14454 - schedule_timeout_interruptible(2);
14455 - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14456 - *statusp = RCU_KTHREAD_WAITING;
14460 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
14461 * served by the rcu_node in question. The CPU hotplug lock is still
14462 @@ -1174,26 +1083,12 @@
14463 free_cpumask_var(cm);
14466 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14467 - .store = &rcu_cpu_kthread_task,
14468 - .thread_should_run = rcu_cpu_kthread_should_run,
14469 - .thread_fn = rcu_cpu_kthread,
14470 - .thread_comm = "rcuc/%u",
14471 - .setup = rcu_cpu_kthread_setup,
14472 - .park = rcu_cpu_kthread_park,
14476 * Spawn boost kthreads -- called as soon as the scheduler is running.
14478 static void __init rcu_spawn_boost_kthreads(void)
14480 struct rcu_node *rnp;
14483 - for_each_possible_cpu(cpu)
14484 - per_cpu(rcu_cpu_has_work, cpu) = 0;
14485 - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14486 rcu_for_each_leaf_node(rcu_state_p, rnp)
14487 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
14489 @@ -1216,11 +1111,6 @@
14490 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
14493 -static void invoke_rcu_callbacks_kthread(void)
14498 static bool rcu_is_callbacks_kthread(void)
14501 @@ -1244,7 +1134,7 @@
14503 #endif /* #else #ifdef CONFIG_RCU_BOOST */
14505 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
14506 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
14509 * Check to see if any future RCU-related work will need to be done
14510 @@ -1261,7 +1151,9 @@
14511 return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
14512 ? 0 : rcu_cpu_has_callbacks(NULL);
14514 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
14516 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
14518 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
14520 @@ -1357,6 +1249,8 @@
14524 +#ifndef CONFIG_PREEMPT_RT_FULL
14527 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
14528 * to invoke. If the CPU has callbacks, try to advance them. Tell the
14529 @@ -1402,6 +1296,7 @@
14530 *nextevt = basemono + dj * TICK_NSEC;
14533 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
14536 * Prepare a CPU for idle from an RCU perspective. The first major task
14537 diff -Nur linux-4.9.6.orig/kernel/rcu/update.c linux-4.9.6/kernel/rcu/update.c
14538 --- linux-4.9.6.orig/kernel/rcu/update.c 2017-01-26 08:25:24.000000000 +0100
14539 +++ linux-4.9.6/kernel/rcu/update.c 2017-01-28 13:59:10.043660493 +0100
14541 #ifndef CONFIG_TINY_RCU
14542 module_param(rcu_expedited, int, 0);
14543 module_param(rcu_normal, int, 0);
14544 -static int rcu_normal_after_boot;
14545 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
14546 module_param(rcu_normal_after_boot, int, 0);
14547 #endif /* #ifndef CONFIG_TINY_RCU */
14549 @@ -132,8 +132,7 @@
14551 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
14553 -static atomic_t rcu_expedited_nesting =
14554 - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
14555 +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
14558 * Should normal grace-period primitives be expedited? Intended for
14559 @@ -182,8 +181,7 @@
14561 void rcu_end_inkernel_boot(void)
14563 - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
14564 - rcu_unexpedite_gp();
14565 + rcu_unexpedite_gp();
14566 if (rcu_normal_after_boot)
14567 WRITE_ONCE(rcu_normal, 1);
14569 @@ -298,6 +296,7 @@
14571 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
14573 +#ifndef CONFIG_PREEMPT_RT_FULL
14575 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
14577 @@ -324,6 +323,7 @@
14578 return in_softirq() || irqs_disabled();
14580 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
14583 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14585 diff -Nur linux-4.9.6.orig/kernel/sched/completion.c linux-4.9.6/kernel/sched/completion.c
14586 --- linux-4.9.6.orig/kernel/sched/completion.c 2017-01-26 08:25:24.000000000 +0100
14587 +++ linux-4.9.6/kernel/sched/completion.c 2017-01-28 13:59:10.043660493 +0100
14588 @@ -30,10 +30,10 @@
14590 unsigned long flags;
14592 - spin_lock_irqsave(&x->wait.lock, flags);
14593 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14595 - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
14596 - spin_unlock_irqrestore(&x->wait.lock, flags);
14597 + swake_up_locked(&x->wait);
14598 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14600 EXPORT_SYMBOL(complete);
14602 @@ -50,10 +50,10 @@
14604 unsigned long flags;
14606 - spin_lock_irqsave(&x->wait.lock, flags);
14607 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14608 x->done += UINT_MAX/2;
14609 - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
14610 - spin_unlock_irqrestore(&x->wait.lock, flags);
14611 + swake_up_all_locked(&x->wait);
14612 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14614 EXPORT_SYMBOL(complete_all);
14616 @@ -62,20 +62,20 @@
14617 long (*action)(long), long timeout, int state)
14620 - DECLARE_WAITQUEUE(wait, current);
14621 + DECLARE_SWAITQUEUE(wait);
14623 - __add_wait_queue_tail_exclusive(&x->wait, &wait);
14624 + __prepare_to_swait(&x->wait, &wait);
14626 if (signal_pending_state(state, current)) {
14627 timeout = -ERESTARTSYS;
14630 __set_current_state(state);
14631 - spin_unlock_irq(&x->wait.lock);
14632 + raw_spin_unlock_irq(&x->wait.lock);
14633 timeout = action(timeout);
14634 - spin_lock_irq(&x->wait.lock);
14635 + raw_spin_lock_irq(&x->wait.lock);
14636 } while (!x->done && timeout);
14637 - __remove_wait_queue(&x->wait, &wait);
14638 + __finish_swait(&x->wait, &wait);
14646 - spin_lock_irq(&x->wait.lock);
14647 + raw_spin_lock_irq(&x->wait.lock);
14648 timeout = do_wait_for_common(x, action, timeout, state);
14649 - spin_unlock_irq(&x->wait.lock);
14650 + raw_spin_unlock_irq(&x->wait.lock);
14654 @@ -277,12 +277,12 @@
14655 if (!READ_ONCE(x->done))
14658 - spin_lock_irqsave(&x->wait.lock, flags);
14659 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14664 - spin_unlock_irqrestore(&x->wait.lock, flags);
14665 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14668 EXPORT_SYMBOL(try_wait_for_completion);
14669 @@ -311,7 +311,7 @@
14670 * after it's acquired the lock.
14673 - spin_unlock_wait(&x->wait.lock);
14674 + raw_spin_unlock_wait(&x->wait.lock);
14677 EXPORT_SYMBOL(completion_done);
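Nothing changes for completion users: the swait conversion is internal. What it buys on RT is that complete() and complete_all() now take a raw (non-sleeping) lock and do bounded wakeups, so they stay callable from atomic context. Standard usage is unchanged:

	static DECLARE_COMPLETION(setup_done);

	static void producer(void)
	{
		/* ... finish initialization ... */
		complete(&setup_done);		/* raw lock, single wakeup */
	}

	static void consumer(void)
	{
		wait_for_completion(&setup_done); /* blocks on the swait queue */
	}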
14678 diff -Nur linux-4.9.6.orig/kernel/sched/core.c linux-4.9.6/kernel/sched/core.c
14679 --- linux-4.9.6.orig/kernel/sched/core.c 2017-01-26 08:25:24.000000000 +0100
14680 +++ linux-4.9.6/kernel/sched/core.c 2017-01-28 13:59:10.043660493 +0100
14681 @@ -129,7 +129,11 @@
14682 * Number of tasks to iterate in a single balance run.
14683 * Limited because this is done with IRQs disabled.
14685 +#ifndef CONFIG_PREEMPT_RT_FULL
14686 const_debug unsigned int sysctl_sched_nr_migrate = 32;
14688 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
14692 * period over which we average the RT time consumption, measured
14693 @@ -345,6 +349,7 @@
14695 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
14696 rq->hrtick_timer.function = hrtick;
14697 + rq->hrtick_timer.irqsafe = 1;
14699 #else /* CONFIG_SCHED_HRTICK */
14700 static inline void hrtick_clear(struct rq *rq)
14701 @@ -449,7 +454,7 @@
14702 head->lastp = &node->next;
14705 -void wake_up_q(struct wake_q_head *head)
14706 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
14708 struct wake_q_node *node = head->first;
14710 @@ -466,7 +471,10 @@
14711 * wake_up_process() implies a wmb() to pair with the queueing
14712 * in wake_q_add() so as not to miss wakeups.
14714 - wake_up_process(task);
14716 + wake_up_lock_sleeper(task);
14718 + wake_up_process(task);
14719 put_task_struct(task);
14722 @@ -502,6 +510,38 @@
14723 trace_sched_wake_idle_without_ipi(cpu);
14726 +#ifdef CONFIG_PREEMPT_LAZY
14727 +void resched_curr_lazy(struct rq *rq)
14729 + struct task_struct *curr = rq->curr;
14732 + if (!sched_feat(PREEMPT_LAZY)) {
14733 + resched_curr(rq);
14737 + lockdep_assert_held(&rq->lock);
14739 + if (test_tsk_need_resched(curr))
14742 + if (test_tsk_need_resched_lazy(curr))
14745 + set_tsk_need_resched_lazy(curr);
14747 + cpu = cpu_of(rq);
14748 + if (cpu == smp_processor_id())
14751 + /* NEED_RESCHED_LAZY must be visible before we test polling */
14753 + if (!tsk_is_polling(curr))
14754 + smp_send_reschedule(cpu);
14758 void resched_cpu(int cpu)
14760 struct rq *rq = cpu_rq(cpu);
14761 @@ -525,11 +565,14 @@
14763 int get_nohz_timer_target(void)
14765 - int i, cpu = smp_processor_id();
14767 struct sched_domain *sd;
14769 + preempt_disable_rt();
14770 + cpu = smp_processor_id();
14772 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
14774 + goto preempt_en_rt;
14777 for_each_domain(cpu, sd) {
14778 @@ -548,6 +591,8 @@
14779 cpu = housekeeping_any_cpu();
14783 + preempt_enable_rt();
14787 @@ -1100,6 +1145,11 @@
14789 lockdep_assert_held(&p->pi_lock);
14791 + if (__migrate_disabled(p)) {
14792 + cpumask_copy(&p->cpus_allowed, new_mask);
14796 queued = task_on_rq_queued(p);
14797 running = task_current(rq, p);
14799 @@ -1122,6 +1172,84 @@
14800 set_curr_task(rq, p);
14803 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
14804 +static DEFINE_MUTEX(sched_down_mutex);
14805 +static cpumask_t sched_down_cpumask;
14807 +void tell_sched_cpu_down_begin(int cpu)
14809 + mutex_lock(&sched_down_mutex);
14810 + cpumask_set_cpu(cpu, &sched_down_cpumask);
14811 + mutex_unlock(&sched_down_mutex);
14814 +void tell_sched_cpu_down_done(int cpu)
14816 + mutex_lock(&sched_down_mutex);
14817 + cpumask_clear_cpu(cpu, &sched_down_cpumask);
14818 + mutex_unlock(&sched_down_mutex);
14822 + * migrate_me - try to move the current task off this cpu
14824 + * Used by the pin_current_cpu() code to try to get tasks
14825 + * to move off the current CPU as it is going down.
14826 + * It will only move the task if the task isn't pinned to
14827 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
14828 + * and the task has to be in the RUNNING state. Otherwise the
14829 + * movement of the task would wake it up (change its state
14830 + * to running) when the task does not expect it.
14832 + * Returns 1 if it succeeded in moving the current task
14835 +int migrate_me(void)
14837 + struct task_struct *p = current;
14838 + struct migration_arg arg;
14839 + struct cpumask *cpumask;
14840 + struct cpumask *mask;
14841 + unsigned int dest_cpu;
14842 + struct rq_flags rf;
14846 + * We cannot migrate tasks bound to a CPU or tasks that are
14847 + * not running. The movement of the task would wake it up.
14849 + if (p->flags & PF_NO_SETAFFINITY || p->state)
14852 + mutex_lock(&sched_down_mutex);
14853 + rq = task_rq_lock(p, &rf);
14855 + cpumask = this_cpu_ptr(&sched_cpumasks);
14856 + mask = &p->cpus_allowed;
14858 + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
14860 + if (!cpumask_weight(cpumask)) {
14861 + /* It's only on this CPU? */
14862 + task_rq_unlock(rq, p, &rf);
14863 + mutex_unlock(&sched_down_mutex);
14867 + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
14870 + arg.dest_cpu = dest_cpu;
14872 + task_rq_unlock(rq, p, &rf);
14874 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
14875 + tlb_migrate_finish(p->mm);
14876 + mutex_unlock(&sched_down_mutex);
14882 * Change a given task's CPU affinity. Migrate the thread to a
14883 * proper CPU and schedule it away if the CPU it's executing on
14884 @@ -1179,7 +1307,7 @@
14887 /* Can the task run on the task's current CPU? If so, we're done */
14888 - if (cpumask_test_cpu(task_cpu(p), new_mask))
14889 + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
14892 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
14893 @@ -1366,6 +1494,18 @@
14897 +static bool check_task_state(struct task_struct *p, long match_state)
14899 + bool match = false;
14901 + raw_spin_lock_irq(&p->pi_lock);
14902 + if (p->state == match_state || p->saved_state == match_state)
14904 + raw_spin_unlock_irq(&p->pi_lock);
14910 * wait_task_inactive - wait for a thread to unschedule.
14912 @@ -1410,7 +1550,7 @@
14913 * is actually now running somewhere else!
14915 while (task_running(rq, p)) {
14916 - if (match_state && unlikely(p->state != match_state))
14917 + if (match_state && !check_task_state(p, match_state))
14921 @@ -1425,7 +1565,8 @@
14922 running = task_running(rq, p);
14923 queued = task_on_rq_queued(p);
14925 - if (!match_state || p->state == match_state)
14926 + if (!match_state || p->state == match_state ||
14927 + p->saved_state == match_state)
14928 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
14929 task_rq_unlock(rq, p, &rf);
14931 @@ -1680,10 +1821,6 @@
14933 activate_task(rq, p, en_flags);
14934 p->on_rq = TASK_ON_RQ_QUEUED;
14936 - /* if a worker is waking up, notify workqueue */
14937 - if (p->flags & PF_WQ_WORKER)
14938 - wq_worker_waking_up(p, cpu_of(rq));
14942 @@ -2018,8 +2155,27 @@
14944 smp_mb__before_spinlock();
14945 raw_spin_lock_irqsave(&p->pi_lock, flags);
14946 - if (!(p->state & state))
14947 + if (!(p->state & state)) {
14949 + * The task might be running due to a spinlock sleeper
14950 + * wakeup. Check the saved state and set it to running
14951 + * if the wakeup condition is true.
14953 + if (!(wake_flags & WF_LOCK_SLEEPER)) {
14954 + if (p->saved_state & state) {
14955 + p->saved_state = TASK_RUNNING;
14963 + * If this is a regular wakeup, then we can unconditionally
14964 + * clear the saved state of a "lock sleeper".
14966 + if (!(wake_flags & WF_LOCK_SLEEPER))
14967 + p->saved_state = TASK_RUNNING;
14969 trace_sched_waking(p);
14971 @@ -2102,53 +2258,6 @@
14975 - * try_to_wake_up_local - try to wake up a local task with rq lock held
14976 - * @p: the thread to be awakened
14977 - * @cookie: context's cookie for pinning
14979 - * Put @p on the run-queue if it's not already there. The caller must
14980 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
14981 - * the current task.
14983 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
14985 - struct rq *rq = task_rq(p);
14987 - if (WARN_ON_ONCE(rq != this_rq()) ||
14988 - WARN_ON_ONCE(p == current))
14991 - lockdep_assert_held(&rq->lock);
14993 - if (!raw_spin_trylock(&p->pi_lock)) {
14995 - * This is OK, because current is on_cpu, which avoids it being
14996 - * picked for load-balance and preemption/IRQs are still
14997 - * disabled avoiding further scheduler activity on it and we've
14998 - * not yet picked a replacement task.
15000 - lockdep_unpin_lock(&rq->lock, cookie);
15001 - raw_spin_unlock(&rq->lock);
15002 - raw_spin_lock(&p->pi_lock);
15003 - raw_spin_lock(&rq->lock);
15004 - lockdep_repin_lock(&rq->lock, cookie);
15007 - if (!(p->state & TASK_NORMAL))
15010 - trace_sched_waking(p);
15012 - if (!task_on_rq_queued(p))
15013 - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
15015 - ttwu_do_wakeup(rq, p, 0, cookie);
15016 - ttwu_stat(p, smp_processor_id(), 0);
15018 - raw_spin_unlock(&p->pi_lock);
15022 * wake_up_process - Wake up a specific process
15023 * @p: The process to be woken up.
15025 @@ -2166,6 +2275,18 @@
15027 EXPORT_SYMBOL(wake_up_process);
15030 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
15031 + * @p: The process to be woken up.
15033 + * Same as wake_up_process() above, but passes wake_flags=WF_LOCK_SLEEPER to indicate
15034 + * the nature of the wakeup.
15036 +int wake_up_lock_sleeper(struct task_struct *p)
15038 + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
15041 int wake_up_state(struct task_struct *p, unsigned int state)
15043 return try_to_wake_up(p, state, 0);
15044 @@ -2442,6 +2563,9 @@
15047 init_task_preempt_count(p);
15048 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15049 + task_thread_info(p)->preempt_lazy_count = 0;
15052 plist_node_init(&p->pushable_tasks, MAX_PRIO);
15053 RB_CLEAR_NODE(&p->pushable_dl_tasks);
15054 @@ -2770,21 +2894,16 @@
15055 finish_arch_post_lock_switch();
15057 fire_sched_in_preempt_notifiers(current);
15059 + * We use mmdrop_delayed() here so we don't have to do the
15060 + * full __mmdrop() when we are the last user.
15064 + mmdrop_delayed(mm);
15065 if (unlikely(prev_state == TASK_DEAD)) {
15066 if (prev->sched_class->task_dead)
15067 prev->sched_class->task_dead(prev);
15070 - * Remove function-return probe instances associated with this
15071 - * task and put them back on the free list.
15073 - kprobe_flush_task(prev);
15075 - /* Task is done with its stack. */
15076 - put_task_stack(prev);
15078 put_task_struct(prev);
15081 @@ -3252,6 +3371,77 @@
15082 schedstat_inc(this_rq()->sched_count);
15085 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
15087 +void migrate_disable(void)
15089 + struct task_struct *p = current;
15091 + if (in_atomic() || irqs_disabled()) {
15092 +#ifdef CONFIG_SCHED_DEBUG
15093 + p->migrate_disable_atomic++;
15098 +#ifdef CONFIG_SCHED_DEBUG
15099 + if (unlikely(p->migrate_disable_atomic)) {
15105 + if (p->migrate_disable) {
15106 + p->migrate_disable++;
15110 + preempt_disable();
15111 + preempt_lazy_disable();
15112 + pin_current_cpu();
15113 + p->migrate_disable = 1;
15114 + preempt_enable();
15116 +EXPORT_SYMBOL(migrate_disable);
15118 +void migrate_enable(void)
15120 + struct task_struct *p = current;
15122 + if (in_atomic() || irqs_disabled()) {
15123 +#ifdef CONFIG_SCHED_DEBUG
15124 + p->migrate_disable_atomic--;
15129 +#ifdef CONFIG_SCHED_DEBUG
15130 + if (unlikely(p->migrate_disable_atomic)) {
15135 + WARN_ON_ONCE(p->migrate_disable <= 0);
15137 + if (p->migrate_disable > 1) {
15138 + p->migrate_disable--;
15142 + preempt_disable();
15144 + * Clearing migrate_disable causes tsk_cpus_allowed to
15145 + * show the task's original cpu affinity.
15147 + p->migrate_disable = 0;
15149 + unpin_current_cpu();
15150 + preempt_enable();
15151 + preempt_lazy_enable();
15153 +EXPORT_SYMBOL(migrate_enable);
15157 * Pick up the highest-prio task:
15159 @@ -3368,19 +3558,6 @@
15161 deactivate_task(rq, prev, DEQUEUE_SLEEP);
15165 - * If a worker went to sleep, notify and ask workqueue
15166 - * whether it wants to wake up a task to maintain
15169 - if (prev->flags & PF_WQ_WORKER) {
15170 - struct task_struct *to_wakeup;
15172 - to_wakeup = wq_worker_sleeping(prev);
15174 - try_to_wake_up_local(to_wakeup, cookie);
15177 switch_count = &prev->nvcsw;
15179 @@ -3390,6 +3567,7 @@
15181 next = pick_next_task(rq, prev, cookie);
15182 clear_tsk_need_resched(prev);
15183 + clear_tsk_need_resched_lazy(prev);
15184 clear_preempt_need_resched();
15185 rq->clock_skip_update = 0;
15187 @@ -3437,9 +3615,20 @@
15189 static inline void sched_submit_work(struct task_struct *tsk)
15191 - if (!tsk->state || tsk_is_pi_blocked(tsk))
15195 + * If a worker went to sleep, notify and ask workqueue whether
15196 + * it wants to wake up a task to maintain concurrency.
15198 + if (tsk->flags & PF_WQ_WORKER)
15199 + wq_worker_sleeping(tsk);
15202 + if (tsk_is_pi_blocked(tsk))
15206 * If we are going to sleep and we have plugged IO queued,
15207 * make sure to submit it to avoid deadlocks.
15209 @@ -3447,6 +3636,12 @@
15210 blk_schedule_flush_plug(tsk);
15213 +static void sched_update_worker(struct task_struct *tsk)
15215 + if (tsk->flags & PF_WQ_WORKER)
15216 + wq_worker_running(tsk);
15219 asmlinkage __visible void __sched schedule(void)
15221 struct task_struct *tsk = current;
15222 @@ -3457,6 +3652,7 @@
15224 sched_preempt_enable_no_resched();
15225 } while (need_resched());
15226 + sched_update_worker(tsk);
15228 EXPORT_SYMBOL(schedule);
15230 @@ -3520,6 +3716,30 @@
15231 } while (need_resched());
15234 +#ifdef CONFIG_PREEMPT_LAZY
15236 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since
15237 + * it is set by an RT task. Otherwise we try to avoid being scheduled out
15238 + * as long as the preempt_lazy_count counter is > 0.
15240 +static __always_inline int preemptible_lazy(void)
15242 + if (test_thread_flag(TIF_NEED_RESCHED))
15244 + if (current_thread_info()->preempt_lazy_count)
15251 +static inline int preemptible_lazy(void)
15258 #ifdef CONFIG_PREEMPT
15260 * this is the entry point to schedule() from in-kernel preemption
15261 @@ -3534,7 +3754,8 @@
15263 if (likely(!preemptible()))
15266 + if (!preemptible_lazy())
15268 preempt_schedule_common();
15270 NOKPROBE_SYMBOL(preempt_schedule);
15271 @@ -3561,6 +3782,9 @@
15272 if (likely(!preemptible()))
15275 + if (!preemptible_lazy())
15280 * Because the function tracer can trace preempt_count_sub()
15281 @@ -3583,7 +3807,16 @@
15282 * an infinite recursion.
15284 prev_ctx = exception_enter();
15286 + * The add/subtract must not be traced by the function
15287 + * tracer. But we still want to account for the
15288 + * preempt off latency tracer. Since the _notrace versions
15289 + * of add/subtract skip the accounting for latency tracer
15290 + * we must force it manually.
15292 + start_critical_timings();
15294 + stop_critical_timings();
15295 exception_exit(prev_ctx);
15297 preempt_latency_stop(1);
15298 @@ -4939,6 +5172,7 @@
15300 EXPORT_SYMBOL(__cond_resched_lock);
15302 +#ifndef CONFIG_PREEMPT_RT_FULL
15303 int __sched __cond_resched_softirq(void)
15305 BUG_ON(!in_softirq());
15306 @@ -4952,6 +5186,7 @@
15309 EXPORT_SYMBOL(__cond_resched_softirq);
15313 * yield - yield the current processor to other threads.
15314 @@ -5315,7 +5550,9 @@
15316 /* Set the preempt count _outside_ the spinlocks! */
15317 init_idle_preempt_count(idle, cpu);
15319 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15320 + task_thread_info(idle)->preempt_lazy_count = 0;
15323 * The idle tasks have their own, simple scheduling class:
15325 @@ -5458,6 +5695,8 @@
15326 #endif /* CONFIG_NUMA_BALANCING */
15328 #ifdef CONFIG_HOTPLUG_CPU
15329 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
15332 * Ensures that the idle task is using init_mm right before its cpu goes
15334 @@ -5472,7 +5711,12 @@
15335 switch_mm_irqs_off(mm, &init_mm, current);
15336 finish_arch_post_lock_switch();
15340 + * Defer the cleanup to a live CPU. On RT we can neither
15341 + * call mmdrop() nor mmdrop_delayed() from here.
15343 + per_cpu(idle_last_mm, smp_processor_id()) = mm;
15348 @@ -7418,6 +7662,10 @@
15349 update_max_interval();
15350 nohz_balance_exit_idle(cpu);
15352 + if (per_cpu(idle_last_mm, cpu)) {
15353 + mmdrop_delayed(per_cpu(idle_last_mm, cpu));
15354 + per_cpu(idle_last_mm, cpu) = NULL;
15359 @@ -7698,7 +7946,7 @@
15360 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15361 static inline int preempt_count_equals(int preempt_offset)
15363 - int nested = preempt_count() + rcu_preempt_depth();
15364 + int nested = preempt_count() + sched_rcu_preempt_depth();
15366 return (nested == preempt_offset);
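migrate_disable()/migrate_enable() are the RT substitute for the common preempt_disable() pattern around per-CPU data: the task is pinned to its CPU but stays preemptible and may block on RT's sleeping spinlocks. A minimal sketch (the per-CPU structure is illustrative):

	struct my_stats {
		unsigned long events;
	};
	static DEFINE_PER_CPU(struct my_stats, my_stats);

	static void count_event(void)
	{
		struct my_stats *s;

		migrate_disable();	/* pinned to this CPU, still preemptible */
		s = this_cpu_ptr(&my_stats);
		s->events++;		/* sleeping locks are fine in here on RT */
		migrate_enable();
	}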
15368 diff -Nur linux-4.9.6.orig/kernel/sched/deadline.c linux-4.9.6/kernel/sched/deadline.c
15369 --- linux-4.9.6.orig/kernel/sched/deadline.c 2017-01-26 08:25:24.000000000 +0100
15370 +++ linux-4.9.6/kernel/sched/deadline.c 2017-01-28 13:59:10.043660493 +0100
15371 @@ -687,6 +687,7 @@
15373 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15374 timer->function = dl_task_timer;
15375 + timer->irqsafe = 1;
15379 diff -Nur linux-4.9.6.orig/kernel/sched/debug.c linux-4.9.6/kernel/sched/debug.c
15380 --- linux-4.9.6.orig/kernel/sched/debug.c 2017-01-26 08:25:24.000000000 +0100
15381 +++ linux-4.9.6/kernel/sched/debug.c 2017-01-28 13:59:10.043660493 +0100
15382 @@ -558,6 +558,9 @@
15387 + P(rt_nr_migratory);
15392 @@ -953,6 +956,10 @@
15396 +#ifdef CONFIG_PREEMPT_RT_FULL
15397 + P(migrate_disable);
15399 + P(nr_cpus_allowed);
15400 #undef PN_SCHEDSTAT
15403 diff -Nur linux-4.9.6.orig/kernel/sched/fair.c linux-4.9.6/kernel/sched/fair.c
15404 --- linux-4.9.6.orig/kernel/sched/fair.c 2017-01-26 08:25:24.000000000 +0100
15405 +++ linux-4.9.6/kernel/sched/fair.c 2017-01-28 13:59:10.047660646 +0100
15406 @@ -3518,7 +3518,7 @@
15407 ideal_runtime = sched_slice(cfs_rq, curr);
15408 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
15409 if (delta_exec > ideal_runtime) {
15410 - resched_curr(rq_of(cfs_rq));
15411 + resched_curr_lazy(rq_of(cfs_rq));
15413 * The current task ran long enough, ensure it doesn't get
15414 * re-elected due to buddy favours.
15415 @@ -3542,7 +3542,7 @@
15418 if (delta > ideal_runtime)
15419 - resched_curr(rq_of(cfs_rq));
15420 + resched_curr_lazy(rq_of(cfs_rq));
15424 @@ -3684,7 +3684,7 @@
15425 * validating it and just reschedule.
15428 - resched_curr(rq_of(cfs_rq));
15429 + resched_curr_lazy(rq_of(cfs_rq));
15433 @@ -3866,7 +3866,7 @@
15434 * hierarchy can be throttled
15436 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
15437 - resched_curr(rq_of(cfs_rq));
15438 + resched_curr_lazy(rq_of(cfs_rq));
15441 static __always_inline
15442 @@ -4494,7 +4494,7 @@
15446 - resched_curr(rq);
15447 + resched_curr_lazy(rq);
15450 hrtick_start(rq, delta);
15451 @@ -5905,7 +5905,7 @@
15455 - resched_curr(rq);
15456 + resched_curr_lazy(rq);
15458 * Only set the backward buddy when the current task is still
15459 * on the rq. This can happen when a wakeup gets interleaved
15460 @@ -8631,7 +8631,7 @@
15461 * 'current' within the tree based on its new key value.
15463 swap(curr->vruntime, se->vruntime);
15464 - resched_curr(rq);
15465 + resched_curr_lazy(rq);
15468 se->vruntime -= cfs_rq->min_vruntime;
15469 @@ -8655,7 +8655,7 @@
15471 if (rq->curr == p) {
15472 if (p->prio > oldprio)
15473 - resched_curr(rq);
15474 + resched_curr_lazy(rq);
15476 check_preempt_curr(rq, p, 0);
15478 diff -Nur linux-4.9.6.orig/kernel/sched/features.h linux-4.9.6/kernel/sched/features.h
15479 --- linux-4.9.6.orig/kernel/sched/features.h 2017-01-26 08:25:24.000000000 +0100
15480 +++ linux-4.9.6/kernel/sched/features.h 2017-01-28 13:59:10.047660646 +0100
15481 @@ -45,11 +45,19 @@
15483 SCHED_FEAT(NONTASK_CAPACITY, true)
15485 +#ifdef CONFIG_PREEMPT_RT_FULL
15486 +SCHED_FEAT(TTWU_QUEUE, false)
15487 +# ifdef CONFIG_PREEMPT_LAZY
15488 +SCHED_FEAT(PREEMPT_LAZY, true)
15493 * Queue remote wakeups on the target CPU and process them
15494 * using the scheduler IPI. Reduces rq->lock contention/bounces.
15496 SCHED_FEAT(TTWU_QUEUE, true)
15499 #ifdef HAVE_RT_PUSH_IPI
15501 diff -Nur linux-4.9.6.orig/kernel/sched/Makefile linux-4.9.6/kernel/sched/Makefile
15502 --- linux-4.9.6.orig/kernel/sched/Makefile 2017-01-26 08:25:24.000000000 +0100
15503 +++ linux-4.9.6/kernel/sched/Makefile 2017-01-28 13:59:10.043660493 +0100
15506 obj-y += core.o loadavg.o clock.o cputime.o
15507 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15508 -obj-y += wait.o swait.o completion.o idle.o
15509 +obj-y += wait.o swait.o swork.o completion.o idle.o
15510 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
15511 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
15512 obj-$(CONFIG_SCHEDSTATS) += stats.o
15513 diff -Nur linux-4.9.6.orig/kernel/sched/rt.c linux-4.9.6/kernel/sched/rt.c
15514 --- linux-4.9.6.orig/kernel/sched/rt.c 2017-01-26 08:25:24.000000000 +0100
15515 +++ linux-4.9.6/kernel/sched/rt.c 2017-01-28 13:59:10.047660646 +0100
15518 hrtimer_init(&rt_b->rt_period_timer,
15519 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15520 + rt_b->rt_period_timer.irqsafe = 1;
15521 rt_b->rt_period_timer.function = sched_rt_period_timer;
15524 @@ -101,6 +102,7 @@
15525 rt_rq->push_cpu = nr_cpu_ids;
15526 raw_spin_lock_init(&rt_rq->push_lock);
15527 init_irq_work(&rt_rq->push_work, push_irq_work_func);
15528 + rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
15530 #endif /* CONFIG_SMP */
15531 /* We start is dequeued state, because no RT tasks are queued */
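On RT, hrtimers normally expire in the timer softirq thread; the irqsafe field added by this patch marks timers whose handlers must keep firing in hard interrupt context, as the scheduler's bandwidth timers do here. The setup pattern, assuming the RT-only ->irqsafe member:

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* runs in hard IRQ context on RT: no sleeping locks here */
		return HRTIMER_NORESTART;
	}

	static void my_timer_setup(void)
	{
		hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		my_timer.irqsafe = 1;	/* RT: expire in hard IRQ, not softirq */
		my_timer.function = my_timer_fn;
	}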
15532 diff -Nur linux-4.9.6.orig/kernel/sched/sched.h linux-4.9.6/kernel/sched/sched.h
15533 --- linux-4.9.6.orig/kernel/sched/sched.h 2017-01-26 08:25:24.000000000 +0100
15534 +++ linux-4.9.6/kernel/sched/sched.h 2017-01-28 13:59:10.047660646 +0100
15535 @@ -1163,6 +1163,7 @@
15536 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
15537 #define WF_FORK 0x02 /* child wakeup after fork */
15538 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
15539 +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
15542 * To aid in avoiding the subversion of "niceness" due to uneven distribution
15543 @@ -1346,6 +1347,15 @@
15544 extern void resched_curr(struct rq *rq);
15545 extern void resched_cpu(int cpu);
15547 +#ifdef CONFIG_PREEMPT_LAZY
15548 +extern void resched_curr_lazy(struct rq *rq);
15550 +static inline void resched_curr_lazy(struct rq *rq)
15552 + resched_curr(rq);
15556 extern struct rt_bandwidth def_rt_bandwidth;
15557 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
15559 diff -Nur linux-4.9.6.orig/kernel/sched/swait.c linux-4.9.6/kernel/sched/swait.c
15560 --- linux-4.9.6.orig/kernel/sched/swait.c 2017-01-26 08:25:24.000000000 +0100
15561 +++ linux-4.9.6/kernel/sched/swait.c 2017-01-28 13:59:10.047660646 +0100
15563 #include <linux/sched.h>
15564 #include <linux/swait.h>
15565 +#include <linux/suspend.h>
15567 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15568 struct lock_class_key *key)
15571 EXPORT_SYMBOL(swake_up_locked);
15573 +void swake_up_all_locked(struct swait_queue_head *q)
15575 + struct swait_queue *curr;
15578 + while (!list_empty(&q->task_list)) {
15580 + curr = list_first_entry(&q->task_list, typeof(*curr),
15582 + wake_up_process(curr->task);
15583 + list_del_init(&curr->task_list);
15586 + if (pm_in_action)
15588 + WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
15590 +EXPORT_SYMBOL(swake_up_all_locked);
15592 void swake_up(struct swait_queue_head *q)
15594 unsigned long flags;
15596 if (!swait_active(q))
15599 + WARN_ON(irqs_disabled());
15600 raw_spin_lock_irq(&q->lock);
15601 list_splice_init(&q->task_list, &tmp);
15602 while (!list_empty(&tmp)) {
15603 diff -Nur linux-4.9.6.orig/kernel/sched/swork.c linux-4.9.6/kernel/sched/swork.c
15604 --- linux-4.9.6.orig/kernel/sched/swork.c 1970-01-01 01:00:00.000000000 +0100
15605 +++ linux-4.9.6/kernel/sched/swork.c 2017-01-28 13:59:10.047660646 +0100
15608 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
15610 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks
15611 + * from irq context. The callbacks are executed in kthread context.
15614 +#include <linux/swait.h>
15615 +#include <linux/swork.h>
15616 +#include <linux/kthread.h>
15617 +#include <linux/slab.h>
15618 +#include <linux/spinlock.h>
15619 +#include <linux/export.h>
15621 +#define SWORK_EVENT_PENDING (1 << 0)
15623 +static DEFINE_MUTEX(worker_mutex);
15624 +static struct sworker *glob_worker;
15627 + struct list_head events;
15628 + struct swait_queue_head wq;
15630 + raw_spinlock_t lock;
15632 + struct task_struct *task;
15636 +static bool swork_readable(struct sworker *worker)
15640 + if (kthread_should_stop())
15643 + raw_spin_lock_irq(&worker->lock);
15644 + r = !list_empty(&worker->events);
15645 + raw_spin_unlock_irq(&worker->lock);
15650 +static int swork_kthread(void *arg)
15652 + struct sworker *worker = arg;
15655 + swait_event_interruptible(worker->wq,
15656 + swork_readable(worker));
15657 + if (kthread_should_stop())
15660 + raw_spin_lock_irq(&worker->lock);
15661 + while (!list_empty(&worker->events)) {
15662 + struct swork_event *sev;
15664 + sev = list_first_entry(&worker->events,
15665 + struct swork_event, item);
15666 + list_del(&sev->item);
15667 + raw_spin_unlock_irq(&worker->lock);
15669 + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
15672 + raw_spin_lock_irq(&worker->lock);
15674 + raw_spin_unlock_irq(&worker->lock);
15679 +static struct sworker *swork_create(void)
15681 + struct sworker *worker;
15683 + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
15685 + return ERR_PTR(-ENOMEM);
15687 + INIT_LIST_HEAD(&worker->events);
15688 + raw_spin_lock_init(&worker->lock);
15689 + init_swait_queue_head(&worker->wq);
15691 + worker->task = kthread_run(swork_kthread, worker, "kswork");
15692 + if (IS_ERR(worker->task)) {
15694 + return ERR_PTR(-ENOMEM);
15700 +static void swork_destroy(struct sworker *worker)
15702 + kthread_stop(worker->task);
15704 + WARN_ON(!list_empty(&worker->events));
15709 + * swork_queue - queue swork
15711 + * Returns %false if @sev was already on a queue, %true otherwise.
15713 + * The work is queued and processed on a random CPU
15715 +bool swork_queue(struct swork_event *sev)
15717 + unsigned long flags;
15719 + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
15722 + raw_spin_lock_irqsave(&glob_worker->lock, flags);
15723 + list_add_tail(&sev->item, &glob_worker->events);
15724 + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
15726 + swake_up(&glob_worker->wq);
15729 +EXPORT_SYMBOL_GPL(swork_queue);
15732 + * swork_get - get an instance of the sworker
15734 + * Returns a negative error code if the initialization of the worker
15735 + * failed, %0 otherwise.
15738 +int swork_get(void)
15740 + struct sworker *worker;
15742 + mutex_lock(&worker_mutex);
15743 + if (!glob_worker) {
15744 + worker = swork_create();
15745 + if (IS_ERR(worker)) {
15746 + mutex_unlock(&worker_mutex);
15750 + glob_worker = worker;
15753 + glob_worker->refs++;
15754 + mutex_unlock(&worker_mutex);
15758 +EXPORT_SYMBOL_GPL(swork_get);
15761 + * swork_put - puts an instance of the sworker
15763 + * Will destroy the sworker thread. This function must not be called until all
15764 + * queued events have been completed.
15766 +void swork_put(void)
15768 + mutex_lock(&worker_mutex);
15770 + glob_worker->refs--;
15771 + if (glob_worker->refs > 0)
15774 + swork_destroy(glob_worker);
15775 + glob_worker = NULL;
15777 + mutex_unlock(&worker_mutex);
15779 +EXPORT_SYMBOL_GPL(swork_put);
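Typical swork usage: take a reference once at init (which spawns the global kswork thread on first use), queue events from hard IRQ context, and drop the reference on teardown after all events have run. This sketch assumes the INIT_SWORK() helper and the ->func callback member from the accompanying linux/swork.h header, which this hunk does not show:

	#include <linux/interrupt.h>
	#include <linux/swork.h>

	static struct swork_event my_event;

	static void my_event_fn(struct swork_event *sev)
	{
		/* runs in the kswork kthread: sleeping is allowed here */
	}

	static int __init my_driver_init(void)
	{
		int err = swork_get();	/* create or ref the kswork thread */

		if (err)
			return err;
		INIT_SWORK(&my_event, my_event_fn);
		return 0;
	}

	static irqreturn_t my_irq_handler(int irq, void *dev_id)
	{
		swork_queue(&my_event);	/* safe from hard IRQ context */
		return IRQ_HANDLED;
	}

	static void my_driver_exit(void)
	{
		swork_put();	/* only after all queued events completed */
	}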
15780 diff -Nur linux-4.9.6.orig/kernel/signal.c linux-4.9.6/kernel/signal.c
15781 --- linux-4.9.6.orig/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
15782 +++ linux-4.9.6/kernel/signal.c 2017-01-28 13:59:10.047660646 +0100
15784 #include <linux/export.h>
15785 #include <linux/init.h>
15786 #include <linux/sched.h>
15787 +#include <linux/sched/rt.h>
15788 #include <linux/fs.h>
15789 #include <linux/tty.h>
15790 #include <linux/binfmts.h>
15791 @@ -352,13 +353,30 @@
15795 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
15797 + struct sigqueue *q = t->sigqueue_cache;
15799 + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
15804 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
15806 + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
15812 * allocate a new signal queue record
15813 * - this may be called without locks if and only if t == current, otherwise an
15814 * appropriate lock must be held to stop the target task from exiting
15816 static struct sigqueue *
15817 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
15818 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
15819 + int override_rlimit, int fromslab)
15821 struct sigqueue *q = NULL;
15822 struct user_struct *user;
15823 @@ -375,7 +393,10 @@
15824 if (override_rlimit ||
15825 atomic_read(&user->sigpending) <=
15826 task_rlimit(t, RLIMIT_SIGPENDING)) {
15827 - q = kmem_cache_alloc(sigqueue_cachep, flags);
15829 + q = get_task_cache(t);
15831 + q = kmem_cache_alloc(sigqueue_cachep, flags);
15833 print_dropped_signal(sig);
15835 @@ -392,6 +413,13 @@
15839 +static struct sigqueue *
15840 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
15841 + int override_rlimit)
15843 + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
15846 static void __sigqueue_free(struct sigqueue *q)
15848 if (q->flags & SIGQUEUE_PREALLOC)
15849 @@ -401,6 +429,21 @@
15850 kmem_cache_free(sigqueue_cachep, q);
15853 +static void sigqueue_free_current(struct sigqueue *q)
15855 + struct user_struct *up;
15857 + if (q->flags & SIGQUEUE_PREALLOC)
15861 + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
15862 + atomic_dec(&up->sigpending);
15865 + __sigqueue_free(q);
15868 void flush_sigqueue(struct sigpending *queue)
15870 struct sigqueue *q;
15871 @@ -414,6 +457,21 @@
15875 + * Called from __exit_signal. Flush tsk->pending and
15876 + * tsk->sigqueue_cache
15878 +void flush_task_sigqueue(struct task_struct *tsk)
15880 + struct sigqueue *q;
15882 + flush_sigqueue(&tsk->pending);
15884 + q = get_task_cache(tsk);
15886 + kmem_cache_free(sigqueue_cachep, q);
15890 * Flush all pending signals for this kthread.
15892 void flush_signals(struct task_struct *t)
15893 @@ -525,7 +583,7 @@
15895 list_del_init(&first->list);
15896 copy_siginfo(info, &first->info);
15897 - __sigqueue_free(first);
15898 + sigqueue_free_current(first);
15901 * Ok, it wasn't in the queue. This must be
15902 @@ -560,6 +618,8 @@
15906 + WARN_ON_ONCE(tsk != current);
15908 /* We only dequeue private signals from ourselves, we don't let
15909 * signalfd steal them
15911 @@ -1156,8 +1216,8 @@
15912 * We don't want to have recursive SIGSEGV's etc, for example,
15913 * that is why we also clear SIGNAL_UNKILLABLE.
15916 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15918 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15920 unsigned long int flags;
15921 int ret, blocked, ignored;
15922 @@ -1182,6 +1242,39 @@
15926 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15929 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
15930 + * since it cannot enable preemption, and the signal code's spin_locks
15931 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
15932 + * send the signal on exit of the trap.
15934 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
15935 + if (in_atomic()) {
15936 + if (WARN_ON_ONCE(t != current))
15938 + if (WARN_ON_ONCE(t->forced_info.si_signo))
15941 + if (is_si_special(info)) {
15942 + WARN_ON_ONCE(info != SEND_SIG_PRIV);
15943 + t->forced_info.si_signo = sig;
15944 + t->forced_info.si_errno = 0;
15945 + t->forced_info.si_code = SI_KERNEL;
15946 + t->forced_info.si_pid = 0;
15947 + t->forced_info.si_uid = 0;
15949 + t->forced_info = *info;
15952 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
15956 + return do_force_sig_info(sig, info, t);
15960 * Nuke all other threads in the group.
15962 @@ -1216,12 +1309,12 @@
15963 * Disable interrupts early to avoid deadlocks.
15964 * See rcu_read_unlock() comment header for details.
15966 - local_irq_save(*flags);
15967 + local_irq_save_nort(*flags);
15969 sighand = rcu_dereference(tsk->sighand);
15970 if (unlikely(sighand == NULL)) {
15972 - local_irq_restore(*flags);
15973 + local_irq_restore_nort(*flags);
15977 @@ -1242,7 +1335,7 @@
15979 spin_unlock(&sighand->siglock);
15981 - local_irq_restore(*flags);
15982 + local_irq_restore_nort(*flags);
15986 @@ -1485,7 +1578,8 @@
15988 struct sigqueue *sigqueue_alloc(void)
15990 - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
15991 + /* Preallocated sigqueue objects always come from the slab cache! */
15992 + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
15995 q->flags |= SIGQUEUE_PREALLOC;
15996 @@ -1846,15 +1940,7 @@
15997 if (gstop_done && ptrace_reparented(current))
15998 do_notify_parent_cldstop(current, false, why);
16001 - * Don't want to allow preemption here, because
16002 - * sys_ptrace() needs this task to be inactive.
16004 - * XXX: implement read_unlock_no_resched().
16006 - preempt_disable();
16007 read_unlock(&tasklist_lock);
16008 - preempt_enable_no_resched();
16009 freezable_schedule();
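The sigqueue changes above hinge on a one-slot, lock-free per-task cache: get_task_cache() claims the slot with cmpxchg(ptr, q, NULL), put_task_cache() refills it with cmpxchg(ptr, NULL, q), so RT-priority tasks recycle one sigqueue without touching the slab allocator. A standalone userspace model of the same pattern (purely illustrative; C11 atomics stand in for the kernel's cmpxchg):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic(void *) slot;    /* one-entry cache */

    static void *cache_get(void)
    {
        void *q = atomic_load(&slot);

        /* Claim the cached object only if nobody raced us. */
        if (q && atomic_compare_exchange_strong(&slot, &q, NULL))
            return q;
        return NULL;
    }

    static int cache_put(void *q)
    {
        void *expected = NULL;

        /* Succeeds only while the slot is empty. */
        return atomic_compare_exchange_strong(&slot, &expected, q) ? 0 : 1;
    }

    int main(void)
    {
        int obj;

        printf("first put:  %d\n", cache_put(&obj));      /* 0: cached */
        printf("second put: %d\n", cache_put(&obj));      /* 1: slot busy */
        printf("get back:   %d\n", cache_get() == &obj);  /* 1 */
        return 0;
    }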
16012 diff -Nur linux-4.9.6.orig/kernel/softirq.c linux-4.9.6/kernel/softirq.c
16013 --- linux-4.9.6.orig/kernel/softirq.c 2017-01-26 08:25:24.000000000 +0100
16014 +++ linux-4.9.6/kernel/softirq.c 2017-01-28 13:59:10.083662028 +0100
16015 @@ -21,10 +21,12 @@
16016 #include <linux/freezer.h>
16017 #include <linux/kthread.h>
16018 #include <linux/rcupdate.h>
16019 +#include <linux/delay.h>
16020 #include <linux/ftrace.h>
16021 #include <linux/smp.h>
16022 #include <linux/smpboot.h>
16023 #include <linux/tick.h>
16024 +#include <linux/locallock.h>
16025 #include <linux/irq.h>
16027 #define CREATE_TRACE_POINTS
16028 @@ -56,12 +58,108 @@
16029 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
16031 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
16032 +#ifdef CONFIG_PREEMPT_RT_FULL
16033 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
16034 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
16037 const char * const softirq_to_name[NR_SOFTIRQS] = {
16038 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
16039 "TASKLET", "SCHED", "HRTIMER", "RCU"
16042 +#ifdef CONFIG_NO_HZ_COMMON
16043 +# ifdef CONFIG_PREEMPT_RT_FULL
16045 +struct softirq_runner {
16046 + struct task_struct *runner[NR_SOFTIRQS];
16049 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
16051 +static inline void softirq_set_runner(unsigned int sirq)
16053 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16055 + sr->runner[sirq] = current;
16058 +static inline void softirq_clr_runner(unsigned int sirq)
16060 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16062 + sr->runner[sirq] = NULL;
16066 + * On preempt-rt a softirq running context might be blocked on a
16067 + * lock. There might be no other runnable task on this CPU because the
16068 + * lock owner runs on some other CPU. So we have to go into idle with
16069 + * the pending bit set. Therefore we need to check this, otherwise we
16070 + * warn about false positives, which confuses users and defeats the
16071 + * whole purpose of this test.
16073 + * This code is called with interrupts disabled.
16075 +void softirq_check_pending_idle(void)
16077 + static int rate_limit;
16078 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16082 + if (rate_limit >= 10)
16085 + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
16086 + for (i = 0; i < NR_SOFTIRQS; i++) {
16087 + struct task_struct *tsk = sr->runner[i];
16090 + * The wakeup code in rtmutex.c wakes up the task
16091 + * _before_ it sets pi_blocked_on to NULL under
16092 + * tsk->pi_lock. So we need to check for both: state
16093 + * and pi_blocked_on.
16096 + raw_spin_lock(&tsk->pi_lock);
16097 + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
16098 + /* Clear all bits pending in that task */
16099 + warnpending &= ~(tsk->softirqs_raised);
16100 + warnpending &= ~(1 << i);
16102 + raw_spin_unlock(&tsk->pi_lock);
16106 + if (warnpending) {
16107 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16114 + * On !PREEMPT_RT we just do a rate-limited printk:
16116 +void softirq_check_pending_idle(void)
16118 + static int rate_limit;
16120 + if (rate_limit < 10 &&
16121 + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
16122 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16123 + local_softirq_pending());
16129 +#else /* !CONFIG_NO_HZ_COMMON */
16130 +static inline void softirq_set_runner(unsigned int sirq) { }
16131 +static inline void softirq_clr_runner(unsigned int sirq) { }
16135 * we cannot loop indefinitely here to avoid userspace starvation,
16136 * but we also don't want to introduce a worst case 1/HZ latency
16137 @@ -77,6 +175,37 @@
16138 wake_up_process(tsk);
16141 +#ifdef CONFIG_PREEMPT_RT_FULL
16142 +static void wakeup_timer_softirqd(void)
16144 + /* Interrupts are disabled: no need to stop preemption */
16145 + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
16147 + if (tsk && tsk->state != TASK_RUNNING)
16148 + wake_up_process(tsk);
16152 +static void handle_softirq(unsigned int vec_nr)
16154 + struct softirq_action *h = softirq_vec + vec_nr;
16157 + prev_count = preempt_count();
16159 + kstat_incr_softirqs_this_cpu(vec_nr);
16161 + trace_softirq_entry(vec_nr);
16163 + trace_softirq_exit(vec_nr);
16164 + if (unlikely(prev_count != preempt_count())) {
16165 + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16166 + vec_nr, softirq_to_name[vec_nr], h->action,
16167 + prev_count, preempt_count());
16168 + preempt_count_set(prev_count);
16173 * If ksoftirqd is scheduled, we do not want to process pending softirqs
16174 * right now. Let ksoftirqd handle this at its own rate, to get fairness.
16175 @@ -88,6 +217,48 @@
16176 return tsk && (tsk->state == TASK_RUNNING);
16179 +#ifndef CONFIG_PREEMPT_RT_FULL
16180 +static inline int ksoftirqd_softirq_pending(void)
16182 + return local_softirq_pending();
16185 +static void handle_pending_softirqs(u32 pending)
16187 + struct softirq_action *h = softirq_vec;
16190 + local_irq_enable();
16194 + while ((softirq_bit = ffs(pending))) {
16195 + unsigned int vec_nr;
16197 + h += softirq_bit - 1;
16198 + vec_nr = h - softirq_vec;
16199 + handle_softirq(vec_nr);
16202 + pending >>= softirq_bit;
16206 + local_irq_disable();
16209 +static void run_ksoftirqd(unsigned int cpu)
16211 + local_irq_disable();
16212 + if (ksoftirqd_softirq_pending()) {
16214 + local_irq_enable();
16215 + cond_resched_rcu_qs();
16218 + local_irq_enable();
16222 * preempt_count and SOFTIRQ_OFFSET usage:
16223 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
16224 @@ -243,10 +414,8 @@
16225 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
16226 unsigned long old_flags = current->flags;
16227 int max_restart = MAX_SOFTIRQ_RESTART;
16228 - struct softirq_action *h;
16234 * Mask out PF_MEMALLOC as current task context is borrowed for the
16235 @@ -265,36 +434,7 @@
16236 /* Reset the pending bitmask before enabling irqs */
16237 set_softirq_pending(0);
16239 - local_irq_enable();
16243 - while ((softirq_bit = ffs(pending))) {
16244 - unsigned int vec_nr;
16247 - h += softirq_bit - 1;
16249 - vec_nr = h - softirq_vec;
16250 - prev_count = preempt_count();
16252 - kstat_incr_softirqs_this_cpu(vec_nr);
16254 - trace_softirq_entry(vec_nr);
16256 - trace_softirq_exit(vec_nr);
16257 - if (unlikely(prev_count != preempt_count())) {
16258 - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16259 - vec_nr, softirq_to_name[vec_nr], h->action,
16260 - prev_count, preempt_count());
16261 - preempt_count_set(prev_count);
16264 - pending >>= softirq_bit;
16268 - local_irq_disable();
16269 + handle_pending_softirqs(pending);
16271 pending = local_softirq_pending();
16273 @@ -331,6 +471,309 @@
16277 + * This function must run with irqs disabled!
16279 +void raise_softirq_irqoff(unsigned int nr)
16281 + __raise_softirq_irqoff(nr);
16284 + * If we're in an interrupt or softirq, we're done
16285 + * (this also catches softirq-disabled code). We will
16286 + * actually run the softirq once we return from
16287 + * the irq or softirq.
16289 + * Otherwise we wake up ksoftirqd to make sure we
16290 + * schedule the softirq soon.
16292 + if (!in_interrupt())
16293 + wakeup_softirqd();
16296 +void __raise_softirq_irqoff(unsigned int nr)
16298 + trace_softirq_raise(nr);
16299 + or_softirq_pending(1UL << nr);
16302 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
16303 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
16304 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
16306 +#else /* !PREEMPT_RT_FULL */
16309 + * On RT we serialize softirq execution with a cpu local lock per softirq
16311 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
16313 +void __init softirq_early_init(void)
16317 + for (i = 0; i < NR_SOFTIRQS; i++)
16318 + local_irq_lock_init(local_softirq_locks[i]);
16321 +static void lock_softirq(int which)
16323 + local_lock(local_softirq_locks[which]);
16326 +static void unlock_softirq(int which)
16328 + local_unlock(local_softirq_locks[which]);
16331 +static void do_single_softirq(int which)
16333 + unsigned long old_flags = current->flags;
16335 + current->flags &= ~PF_MEMALLOC;
16336 + vtime_account_irq_enter(current);
16337 + current->flags |= PF_IN_SOFTIRQ;
16338 + lockdep_softirq_enter();
16339 + local_irq_enable();
16340 + handle_softirq(which);
16341 + local_irq_disable();
16342 + lockdep_softirq_exit();
16343 + current->flags &= ~PF_IN_SOFTIRQ;
16344 + vtime_account_irq_enter(current);
16345 + tsk_restore_flags(current, old_flags, PF_MEMALLOC);
16349 + * Called with interrupts disabled. Process softirqs which were raised
16350 + * in current context (or on behalf of ksoftirqd).
16352 +static void do_current_softirqs(void)
16354 + while (current->softirqs_raised) {
16355 + int i = __ffs(current->softirqs_raised);
16356 + unsigned int pending, mask = (1U << i);
16358 + current->softirqs_raised &= ~mask;
16359 + local_irq_enable();
16362 + * If the lock is contended, we boost the owner to
16363 + * process the softirq or leave the critical section
16367 + local_irq_disable();
16368 + softirq_set_runner(i);
16370 + * Check against the local_softirq_pending() bits
16371 + * whether we still need to process this, or if someone
16372 + * else already took care of it.
16374 + pending = local_softirq_pending();
16375 + if (pending & mask) {
16376 + set_softirq_pending(pending & ~mask);
16377 + do_single_softirq(i);
16379 + softirq_clr_runner(i);
16380 + WARN_ON(current->softirq_nestcnt != 1);
16381 + local_irq_enable();
16382 + unlock_softirq(i);
16383 + local_irq_disable();
16387 +void __local_bh_disable(void)
16389 + if (++current->softirq_nestcnt == 1)
16390 + migrate_disable();
16392 +EXPORT_SYMBOL(__local_bh_disable);
16394 +void __local_bh_enable(void)
16396 + if (WARN_ON(current->softirq_nestcnt == 0))
16399 + local_irq_disable();
16400 + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
16401 + do_current_softirqs();
16402 + local_irq_enable();
16404 + if (--current->softirq_nestcnt == 0)
16405 + migrate_enable();
16407 +EXPORT_SYMBOL(__local_bh_enable);
16409 +void _local_bh_enable(void)
16411 + if (WARN_ON(current->softirq_nestcnt == 0))
16413 + if (--current->softirq_nestcnt == 0)
16414 + migrate_enable();
16416 +EXPORT_SYMBOL(_local_bh_enable);
16418 +int in_serving_softirq(void)
16420 + return current->flags & PF_IN_SOFTIRQ;
16422 +EXPORT_SYMBOL(in_serving_softirq);
16424 +/* Called with preemption disabled */
16425 +static void run_ksoftirqd(unsigned int cpu)
16427 + local_irq_disable();
16428 + current->softirq_nestcnt++;
16430 + do_current_softirqs();
16431 + current->softirq_nestcnt--;
16432 + local_irq_enable();
16433 + cond_resched_rcu_qs();
16437 + * Called from netif_rx_ni(). Preemption enabled, but migration
16438 + * disabled. So the cpu can't go away under us.
16440 +void thread_do_softirq(void)
16442 + if (!in_serving_softirq() && current->softirqs_raised) {
16443 + current->softirq_nestcnt++;
16444 + do_current_softirqs();
16445 + current->softirq_nestcnt--;
16449 +static void do_raise_softirq_irqoff(unsigned int nr)
16451 + unsigned int mask;
16453 + mask = 1UL << nr;
16455 + trace_softirq_raise(nr);
16456 + or_softirq_pending(mask);
16459 + * If we are not in a hard interrupt and inside a bh disabled
16460 + * region, we simply raise the flag on current. local_bh_enable()
16461 + * will make sure that the softirq is executed. Otherwise we
16462 + * delegate it to ksoftirqd.
16464 + if (!in_irq() && current->softirq_nestcnt)
16465 + current->softirqs_raised |= mask;
16466 + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
16469 + if (mask & TIMER_SOFTIRQS)
16470 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16472 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16475 +static void wakeup_proper_softirq(unsigned int nr)
16477 + if ((1UL << nr) & TIMER_SOFTIRQS)
16478 + wakeup_timer_softirqd();
16480 + wakeup_softirqd();
16483 +void __raise_softirq_irqoff(unsigned int nr)
16485 + do_raise_softirq_irqoff(nr);
16486 + if (!in_irq() && !current->softirq_nestcnt)
16487 + wakeup_proper_softirq(nr);
16491 + * Same as __raise_softirq_irqoff(), but processes the softirq in ksoftirqd
16493 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
16495 + unsigned int mask;
16497 + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
16498 + !__this_cpu_read(ktimer_softirqd)))
16500 + mask = 1UL << nr;
16502 + trace_softirq_raise(nr);
16503 + or_softirq_pending(mask);
16504 + if (mask & TIMER_SOFTIRQS)
16505 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16507 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16508 + wakeup_proper_softirq(nr);
16512 + * This function must run with irqs disabled!
16514 +void raise_softirq_irqoff(unsigned int nr)
16516 + do_raise_softirq_irqoff(nr);
16519 + * If we're in a hard interrupt we let the irq return code deal
16520 + * with the wakeup of ksoftirqd.
16525 + * If we are in thread context but outside of a bh disabled
16526 + * region, we need to wake ksoftirqd as well.
16528 + * CHECKME: Some of the places which do that could be wrapped
16529 + * into local_bh_disable/enable pairs. Though it's unclear
16530 + * whether this is worth the effort. To find those places just
16531 + * raise a WARN() if the condition is met.
16533 + if (!current->softirq_nestcnt)
16534 + wakeup_proper_softirq(nr);
16537 +static inline int ksoftirqd_softirq_pending(void)
16539 + return current->softirqs_raised;
16542 +static inline void local_bh_disable_nort(void) { }
16543 +static inline void _local_bh_enable_nort(void) { }
16545 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
16547 + /* Take over all pending softirqs except timer ones when starting */
16548 + local_irq_disable();
16549 + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
16550 + local_irq_enable();
16553 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
16555 + struct sched_param param = { .sched_priority = 1 };
16557 + sched_setscheduler(current, SCHED_FIFO, &param);
16559 + /* Take over the pending timer softirqs when starting */
16560 + local_irq_disable();
16561 + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
16562 + local_irq_enable();
16565 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
16568 + struct sched_param param = { .sched_priority = 0 };
16570 + sched_setscheduler(current, SCHED_NORMAL, &param);
16573 +static int ktimer_softirqd_should_run(unsigned int cpu)
16575 + return current->softirqs_raised;
16578 +#endif /* PREEMPT_RT_FULL */
16580 * Enter an interrupt context.
16582 void irq_enter(void)
16583 @@ -341,9 +784,9 @@
16584 * Prevent raise_softirq from needlessly waking up ksoftirqd
16585 * here, as softirq will be serviced on return from interrupt.
16587 - local_bh_disable();
16588 + local_bh_disable_nort();
16590 - _local_bh_enable();
16591 + _local_bh_enable_nort();
16595 @@ -351,9 +794,13 @@
16597 static inline void invoke_softirq(void)
16599 +#ifdef CONFIG_PREEMPT_RT_FULL
16600 + unsigned long flags;
16603 if (ksoftirqd_running())
16606 +#ifndef CONFIG_PREEMPT_RT_FULL
16607 if (!force_irqthreads) {
16608 #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
16610 @@ -373,6 +820,17 @@
16614 +#else /* PREEMPT_RT_FULL */
16616 + local_irq_save(flags);
16617 + if (__this_cpu_read(ksoftirqd) &&
16618 + __this_cpu_read(ksoftirqd)->softirqs_raised)
16619 + wakeup_softirqd();
16620 + if (__this_cpu_read(ktimer_softirqd) &&
16621 + __this_cpu_read(ktimer_softirqd)->softirqs_raised)
16622 + wakeup_timer_softirqd();
16623 + local_irq_restore(flags);
16627 static inline void tick_irq_exit(void)
16628 @@ -409,26 +867,6 @@
16629 trace_hardirq_exit(); /* must be last! */
16633 - * This function must run with irqs disabled!
16635 -inline void raise_softirq_irqoff(unsigned int nr)
16637 - __raise_softirq_irqoff(nr);
16640 - * If we're in an interrupt or softirq, we're done
16641 - * (this also catches softirq-disabled code). We will
16642 - * actually run the softirq once we return from
16643 - * the irq or softirq.
16645 - * Otherwise we wake up ksoftirqd to make sure we
16646 - * schedule the softirq soon.
16648 - if (!in_interrupt())
16649 - wakeup_softirqd();
16652 void raise_softirq(unsigned int nr)
16654 unsigned long flags;
16655 @@ -438,12 +876,6 @@
16656 local_irq_restore(flags);
16659 -void __raise_softirq_irqoff(unsigned int nr)
16661 - trace_softirq_raise(nr);
16662 - or_softirq_pending(1UL << nr);
16665 void open_softirq(int nr, void (*action)(struct softirq_action *))
16667 softirq_vec[nr].action = action;
16668 @@ -460,15 +892,45 @@
16669 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
16670 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
16672 +static inline void
16673 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
16675 + if (tasklet_trylock(t)) {
16677 + /* We may have been preempted before tasklet_trylock
16678 + * and __tasklet_action may have already run.
16679 + * So double-check the sched bit while the tasklet
16680 + * is locked before adding it to the list.
16682 + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
16685 + head->tail = &(t->next);
16686 + raise_softirq_irqoff(nr);
16687 + tasklet_unlock(t);
16689 + /* This is subtle. If we hit the corner case above,
16690 + * it is possible that we get preempted right here,
16691 + * and another task has successfully called
16692 + * tasklet_schedule(), then entered this function, and
16693 + * failed on the trylock. Thus we must be sure
16694 + * before releasing the tasklet lock, that the
16695 + * SCHED_BIT is clear. Otherwise the tasklet
16696 + * may get its SCHED_BIT set, but not added to the
16699 + if (!tasklet_tryunlock(t))
16705 void __tasklet_schedule(struct tasklet_struct *t)
16707 unsigned long flags;
16709 local_irq_save(flags);
16711 - *__this_cpu_read(tasklet_vec.tail) = t;
16712 - __this_cpu_write(tasklet_vec.tail, &(t->next));
16713 - raise_softirq_irqoff(TASKLET_SOFTIRQ);
16714 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
16715 local_irq_restore(flags);
16717 EXPORT_SYMBOL(__tasklet_schedule);
16718 @@ -478,10 +940,7 @@
16719 unsigned long flags;
16721 local_irq_save(flags);
16723 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
16724 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
16725 - raise_softirq_irqoff(HI_SOFTIRQ);
16726 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
16727 local_irq_restore(flags);
16729 EXPORT_SYMBOL(__tasklet_hi_schedule);
16730 @@ -490,82 +949,122 @@
16732 BUG_ON(!irqs_disabled());
16734 - t->next = __this_cpu_read(tasklet_hi_vec.head);
16735 - __this_cpu_write(tasklet_hi_vec.head, t);
16736 - __raise_softirq_irqoff(HI_SOFTIRQ);
16737 + __tasklet_hi_schedule(t);
16739 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
16741 -static __latent_entropy void tasklet_action(struct softirq_action *a)
16742 +void tasklet_enable(struct tasklet_struct *t)
16744 - struct tasklet_struct *list;
16745 + if (!atomic_dec_and_test(&t->count))
16747 + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
16748 + tasklet_schedule(t);
16750 +EXPORT_SYMBOL(tasklet_enable);
16752 - local_irq_disable();
16753 - list = __this_cpu_read(tasklet_vec.head);
16754 - __this_cpu_write(tasklet_vec.head, NULL);
16755 - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
16756 - local_irq_enable();
16757 +static void __tasklet_action(struct softirq_action *a,
16758 + struct tasklet_struct *list)
16760 + int loops = 1000000;
16763 struct tasklet_struct *t = list;
16767 - if (tasklet_trylock(t)) {
16768 - if (!atomic_read(&t->count)) {
16769 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
16772 - t->func(t->data);
16773 - tasklet_unlock(t);
16776 - tasklet_unlock(t);
16778 + * Should always succeed - after a tasklet got on the
16779 + * list (after getting the SCHED bit set from 0 to 1),
16780 + * nothing but the tasklet softirq it got queued to can
16783 + if (!tasklet_trylock(t)) {
16788 - local_irq_disable();
16790 - *__this_cpu_read(tasklet_vec.tail) = t;
16791 - __this_cpu_write(tasklet_vec.tail, &(t->next));
16792 - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
16793 - local_irq_enable();
16796 + * If we cannot handle the tasklet because it's disabled,
16797 + * mark it as pending. tasklet_enable() will later
16798 + * re-schedule the tasklet.
16800 + if (unlikely(atomic_read(&t->count))) {
16802 + /* implicit unlock: */
16804 + t->state = TASKLET_STATEF_PENDING;
16809 + * From this point on the tasklet might be rescheduled
16810 + * on another CPU, but it can only be added to another
16811 + * CPU's tasklet list if we unlock the tasklet (which we
16814 + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
16818 + t->func(t->data);
16821 + * Try to unlock the tasklet. We must use cmpxchg, because
16822 + * another CPU might have scheduled or disabled the tasklet.
16823 + * We only allow the STATE_RUN -> 0 transition here.
16825 + while (!tasklet_tryunlock(t)) {
16827 + * If it got disabled meanwhile, bail out:
16829 + if (atomic_read(&t->count))
16830 + goto out_disabled;
16832 + * If it got scheduled meanwhile, re-execute
16833 + * the tasklet function:
16835 + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
16838 + printk("hm, tasklet state: %08lx\n", t->state);
16840 + tasklet_unlock(t);
16847 +static void tasklet_action(struct softirq_action *a)
16849 + struct tasklet_struct *list;
16851 + local_irq_disable();
16853 + list = __this_cpu_read(tasklet_vec.head);
16854 + __this_cpu_write(tasklet_vec.head, NULL);
16855 + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
16857 + local_irq_enable();
16859 + __tasklet_action(a, list);
16862 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
16864 struct tasklet_struct *list;
16866 local_irq_disable();
16868 list = __this_cpu_read(tasklet_hi_vec.head);
16869 __this_cpu_write(tasklet_hi_vec.head, NULL);
16870 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
16871 - local_irq_enable();
16874 - struct tasklet_struct *t = list;
16876 - list = list->next;
16878 - if (tasklet_trylock(t)) {
16879 - if (!atomic_read(&t->count)) {
16880 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
16883 - t->func(t->data);
16884 - tasklet_unlock(t);
16887 - tasklet_unlock(t);
16889 + local_irq_enable();
16891 - local_irq_disable();
16893 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
16894 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
16895 - __raise_softirq_irqoff(HI_SOFTIRQ);
16896 - local_irq_enable();
16898 + __tasklet_action(a, list);
16901 void tasklet_init(struct tasklet_struct *t,
16902 @@ -586,7 +1085,7 @@
16904 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
16908 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
16910 tasklet_unlock_wait(t);
16911 @@ -660,25 +1159,26 @@
16912 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
16915 -static int ksoftirqd_should_run(unsigned int cpu)
16916 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16917 +void tasklet_unlock_wait(struct tasklet_struct *t)
16919 - return local_softirq_pending();
16922 -static void run_ksoftirqd(unsigned int cpu)
16924 - local_irq_disable();
16925 - if (local_softirq_pending()) {
16926 + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
16928 - * We can safely run softirq on inline stack, as we are not deep
16929 - * in the task stack here.
16930 + * Hack for now to avoid this busy-loop:
16933 - local_irq_enable();
16934 - cond_resched_rcu_qs();
16936 +#ifdef CONFIG_PREEMPT_RT_FULL
16942 - local_irq_enable();
16944 +EXPORT_SYMBOL(tasklet_unlock_wait);
16947 +static int ksoftirqd_should_run(unsigned int cpu)
16949 + return ksoftirqd_softirq_pending();
16952 #ifdef CONFIG_HOTPLUG_CPU
16953 @@ -745,17 +1245,31 @@
16955 static struct smp_hotplug_thread softirq_threads = {
16956 .store = &ksoftirqd,
16957 + .setup = ksoftirqd_set_sched_params,
16958 .thread_should_run = ksoftirqd_should_run,
16959 .thread_fn = run_ksoftirqd,
16960 .thread_comm = "ksoftirqd/%u",
16963 +#ifdef CONFIG_PREEMPT_RT_FULL
16964 +static struct smp_hotplug_thread softirq_timer_threads = {
16965 + .store = &ktimer_softirqd,
16966 + .setup = ktimer_softirqd_set_sched_params,
16967 + .cleanup = ktimer_softirqd_clr_sched_params,
16968 + .thread_should_run = ktimer_softirqd_should_run,
16969 + .thread_fn = run_ksoftirqd,
16970 + .thread_comm = "ktimersoftd/%u",
16974 static __init int spawn_ksoftirqd(void)
16976 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
16977 takeover_tasklets);
16978 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
16980 +#ifdef CONFIG_PREEMPT_RT_FULL
16981 + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
16985 early_initcall(spawn_ksoftirqd);
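The bit-walk that handle_pending_softirqs() factors out above processes the lowest pending softirq first and shifts the mask as it goes. A tiny userspace model of that ffs() loop, with names[] mirroring softirq_to_name[]:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    static const char * const names[] = {
        "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
        "TASKLET", "SCHED", "HRTIMER", "RCU"
    };

    int main(void)
    {
        unsigned int pending = (1u << 1) | (1u << 3) | (1u << 9);
        const char * const *h = names;
        int softirq_bit;

        /* Lowest pending bit first, exactly like the kernel loop. */
        while ((softirq_bit = ffs(pending))) {
            h += softirq_bit - 1;
            printf("softirq %ld: %s\n", (long)(h - names), *h);
            h++;
            pending >>= softirq_bit;
        }
        return 0;    /* prints TIMER, NET_RX, RCU */
    }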
16986 diff -Nur linux-4.9.6.orig/kernel/stop_machine.c linux-4.9.6/kernel/stop_machine.c
16987 --- linux-4.9.6.orig/kernel/stop_machine.c 2017-01-26 08:25:24.000000000 +0100
16988 +++ linux-4.9.6/kernel/stop_machine.c 2017-01-28 13:59:10.087662182 +0100
16990 struct cpu_stopper {
16991 struct task_struct *thread;
16994 + raw_spinlock_t lock;
16995 bool enabled; /* is this stopper enabled? */
16996 struct list_head works; /* list of pending works */
16998 @@ -78,14 +78,14 @@
16999 unsigned long flags;
17002 - spin_lock_irqsave(&stopper->lock, flags);
17003 + raw_spin_lock_irqsave(&stopper->lock, flags);
17004 enabled = stopper->enabled;
17006 __cpu_stop_queue_work(stopper, work);
17007 else if (work->done)
17008 cpu_stop_signal_done(work->done);
17009 - spin_unlock_irqrestore(&stopper->lock, flags);
17011 + raw_spin_unlock_irqrestore(&stopper->lock, flags);
17015 @@ -231,8 +231,8 @@
17016 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
17019 - spin_lock_irq(&stopper1->lock);
17020 - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17021 + raw_spin_lock_irq(&stopper1->lock);
17022 + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17025 if (!stopper1->enabled || !stopper2->enabled)
17026 @@ -255,8 +255,8 @@
17027 __cpu_stop_queue_work(stopper1, work1);
17028 __cpu_stop_queue_work(stopper2, work2);
17030 - spin_unlock(&stopper2->lock);
17031 - spin_unlock_irq(&stopper1->lock);
17032 + raw_spin_unlock(&stopper2->lock);
17033 + raw_spin_unlock_irq(&stopper1->lock);
17035 if (unlikely(err == -EDEADLK)) {
17036 while (stop_cpus_in_progress)
17037 @@ -448,9 +448,9 @@
17038 unsigned long flags;
17041 - spin_lock_irqsave(&stopper->lock, flags);
17042 + raw_spin_lock_irqsave(&stopper->lock, flags);
17043 run = !list_empty(&stopper->works);
17044 - spin_unlock_irqrestore(&stopper->lock, flags);
17045 + raw_spin_unlock_irqrestore(&stopper->lock, flags);
17049 @@ -461,13 +461,13 @@
17053 - spin_lock_irq(&stopper->lock);
17054 + raw_spin_lock_irq(&stopper->lock);
17055 if (!list_empty(&stopper->works)) {
17056 work = list_first_entry(&stopper->works,
17057 struct cpu_stop_work, list);
17058 list_del_init(&work->list);
17060 - spin_unlock_irq(&stopper->lock);
17061 + raw_spin_unlock_irq(&stopper->lock);
17064 cpu_stop_fn_t fn = work->fn;
17065 @@ -475,6 +475,8 @@
17066 struct cpu_stop_done *done = work->done;
17071 /* cpu stop callbacks must not sleep, make in_atomic() == T */
17072 preempt_count_inc();
17074 @@ -541,7 +543,7 @@
17075 for_each_possible_cpu(cpu) {
17076 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
17078 - spin_lock_init(&stopper->lock);
17079 + raw_spin_lock_init(&stopper->lock);
17080 INIT_LIST_HEAD(&stopper->works);
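The stop_machine.c hunks above are a pure lock-type conversion: on PREEMPT_RT a spinlock_t becomes a sleeping rtmutex-based lock, while the stopper work is queued and consumed in contexts that must never sleep, hence raw_spinlock_t. The resulting pattern, with a hypothetical lock name:

    static DEFINE_RAW_SPINLOCK(my_lock);

    static void my_atomic_path(void)
    {
        unsigned long flags;

        /* Strictly non-sleeping critical section; valid on RT. */
        raw_spin_lock_irqsave(&my_lock, flags);
        /* ... */
        raw_spin_unlock_irqrestore(&my_lock, flags);
    }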
17083 diff -Nur linux-4.9.6.orig/kernel/time/hrtimer.c linux-4.9.6/kernel/time/hrtimer.c
17084 --- linux-4.9.6.orig/kernel/time/hrtimer.c 2017-01-26 08:25:24.000000000 +0100
17085 +++ linux-4.9.6/kernel/time/hrtimer.c 2017-01-28 13:59:10.087662182 +0100
17087 #include <asm/uaccess.h>
17089 #include <trace/events/timer.h>
17090 +#include <trace/events/hist.h>
17092 #include "tick-internal.h"
17094 @@ -695,6 +696,29 @@
17095 retrigger_next_event(NULL);
17098 +#ifdef CONFIG_PREEMPT_RT_FULL
17100 +static struct swork_event clock_set_delay_work;
17102 +static void run_clock_set_delay(struct swork_event *event)
17107 +void clock_was_set_delayed(void)
17109 + swork_queue(&clock_set_delay_work);
17112 +static __init int create_clock_set_delay_thread(void)
17114 + WARN_ON(swork_get());
17115 + INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
17118 +early_initcall(create_clock_set_delay_thread);
17119 +#else /* PREEMPT_RT_FULL */
17121 static void clock_was_set_work(struct work_struct *work)
17124 @@ -710,6 +734,7 @@
17126 schedule_work(&hrtimer_work);
17132 @@ -719,11 +744,8 @@
17133 static inline void hrtimer_switch_to_hres(void) { }
17135 hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
17136 -static inline int hrtimer_reprogram(struct hrtimer *timer,
17137 - struct hrtimer_clock_base *base)
17141 +static inline void hrtimer_reprogram(struct hrtimer *timer,
17142 + struct hrtimer_clock_base *base) { }
17143 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
17144 static inline void retrigger_next_event(void *arg) { }
17146 @@ -855,6 +877,32 @@
17148 EXPORT_SYMBOL_GPL(hrtimer_forward);
17150 +#ifdef CONFIG_PREEMPT_RT_BASE
17151 +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
17154 + * hrtimer_wait_for_timer - Wait for a running timer
17156 + * @timer: timer to wait for
17158 + * The function waits on the waitqueue of the timer base in case
17159 + * the timer's callback function is currently executing. The
17160 + * waitqueue is woken up after the timer callback function has
17161 + * finished execution.
17163 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
17165 + struct hrtimer_clock_base *base = timer->base;
17167 + if (base && base->cpu_base && !timer->irqsafe)
17168 + wait_event(base->cpu_base->wait,
17169 + !(hrtimer_callback_running(timer)));
17173 +# define wake_up_timer_waiters(b) do { } while (0)
17177 * enqueue_hrtimer - internal function to (re)start a timer
17179 @@ -896,6 +944,11 @@
17180 if (!(state & HRTIMER_STATE_ENQUEUED))
17183 + if (unlikely(!list_empty(&timer->cb_entry))) {
17184 + list_del_init(&timer->cb_entry);
17188 if (!timerqueue_del(&base->active, &timer->node))
17189 cpu_base->active_bases &= ~(1 << base->index);
17191 @@ -991,7 +1044,16 @@
17192 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
17194 timer_stats_hrtimer_set_start_info(timer);
17195 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
17197 + ktime_t now = new_base->get_time();
17199 + if (ktime_to_ns(tim) < ktime_to_ns(now))
17200 + timer->praecox = now;
17202 + timer->praecox = ktime_set(0, 0);
17205 leftmost = enqueue_hrtimer(timer, new_base);
17208 @@ -1063,7 +1125,7 @@
17213 + hrtimer_wait_for_timer(timer);
17216 EXPORT_SYMBOL_GPL(hrtimer_cancel);
17217 @@ -1127,6 +1189,7 @@
17219 base = hrtimer_clockid_to_base(clock_id);
17220 timer->base = &cpu_base->clock_base[base];
17221 + INIT_LIST_HEAD(&timer->cb_entry);
17222 timerqueue_init(&timer->node);
17224 #ifdef CONFIG_TIMER_STATS
17225 @@ -1167,6 +1230,7 @@
17226 seq = raw_read_seqcount_begin(&cpu_base->seq);
17228 if (timer->state != HRTIMER_STATE_INACTIVE ||
17229 + cpu_base->running_soft == timer ||
17230 cpu_base->running == timer)
17233 @@ -1265,10 +1329,112 @@
17234 cpu_base->running = NULL;
17237 +#ifdef CONFIG_PREEMPT_RT_BASE
17238 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
17239 + struct hrtimer_clock_base *base)
17243 + if (restart != HRTIMER_NORESTART &&
17244 + !(timer->state & HRTIMER_STATE_ENQUEUED)) {
17246 + leftmost = enqueue_hrtimer(timer, base);
17249 +#ifdef CONFIG_HIGH_RES_TIMERS
17250 + if (!hrtimer_is_hres_active(timer)) {
17252 + * Kick to reschedule the next tick to handle the new timer
17253 + * on dynticks target.
17255 + if (base->cpu_base->nohz_active)
17256 + wake_up_nohz_cpu(base->cpu_base->cpu);
17259 + hrtimer_reprogram(timer, base);
17266 + * The changes in mainline which removed the callback modes from
17267 + * hrtimer are not yet working with -rt. The non-wakeup_process()
17268 + * based callbacks, which involve sleeping locks, need to be treated
17271 +static void hrtimer_rt_run_pending(void)
17273 + enum hrtimer_restart (*fn)(struct hrtimer *);
17274 + struct hrtimer_cpu_base *cpu_base;
17275 + struct hrtimer_clock_base *base;
17276 + struct hrtimer *timer;
17277 + int index, restart;
17279 + local_irq_disable();
17280 + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
17282 + raw_spin_lock(&cpu_base->lock);
17284 + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
17285 + base = &cpu_base->clock_base[index];
17287 + while (!list_empty(&base->expired)) {
17288 + timer = list_first_entry(&base->expired,
17289 + struct hrtimer, cb_entry);
17292 + * Same as the above __run_hrtimer function,
17293 + * except that we run with interrupts enabled.
17295 + debug_deactivate(timer);
17296 + cpu_base->running_soft = timer;
17297 + raw_write_seqcount_barrier(&cpu_base->seq);
17299 + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
17300 + timer_stats_account_hrtimer(timer);
17301 + fn = timer->function;
17303 + raw_spin_unlock_irq(&cpu_base->lock);
17304 + restart = fn(timer);
17305 + raw_spin_lock_irq(&cpu_base->lock);
17307 + hrtimer_rt_reprogram(restart, timer, base);
17308 + raw_write_seqcount_barrier(&cpu_base->seq);
17310 + WARN_ON_ONCE(cpu_base->running_soft != timer);
17311 + cpu_base->running_soft = NULL;
17315 + raw_spin_unlock_irq(&cpu_base->lock);
17317 + wake_up_timer_waiters(cpu_base);
17320 +static int hrtimer_rt_defer(struct hrtimer *timer)
17322 + if (timer->irqsafe)
17325 + __remove_hrtimer(timer, timer->base, timer->state, 0);
17326 + list_add_tail(&timer->cb_entry, &timer->base->expired);
17332 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
17336 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
17338 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
17340 struct hrtimer_clock_base *base = cpu_base->clock_base;
17341 unsigned int active = cpu_base->active_bases;
17344 for (; active; base++, active >>= 1) {
17345 struct timerqueue_node *node;
17346 @@ -1284,6 +1450,15 @@
17348 timer = container_of(node, struct hrtimer, node);
17350 + trace_hrtimer_interrupt(raw_smp_processor_id(),
17351 + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
17352 + timer->praecox : hrtimer_get_expires(timer),
17355 + timer->function == hrtimer_wakeup ?
17356 + container_of(timer, struct hrtimer_sleeper,
17357 + timer)->task : NULL);
17360 * The immediate goal for using the softexpires is
17361 * minimizing wakeups, not running timers at the
17362 @@ -1299,9 +1474,14 @@
17363 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
17366 - __run_hrtimer(cpu_base, base, timer, &basenow);
17367 + if (!hrtimer_rt_defer(timer))
17368 + __run_hrtimer(cpu_base, base, timer, &basenow);
17374 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
17377 #ifdef CONFIG_HIGH_RES_TIMERS
17378 @@ -1464,16 +1644,18 @@
17379 void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
17381 sl->timer.function = hrtimer_wakeup;
17382 + sl->timer.irqsafe = 1;
17385 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
17387 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
17388 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
17389 + unsigned long state)
17391 hrtimer_init_sleeper(t, current);
17394 - set_current_state(TASK_INTERRUPTIBLE);
17395 + set_current_state(state);
17396 hrtimer_start_expires(&t->timer, mode);
17398 if (likely(t->task))
17399 @@ -1515,7 +1697,8 @@
17401 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
17403 - if (do_nanosleep(&t, HRTIMER_MODE_ABS))
17404 + /* cpu_chill() does not care about restart state. */
17405 + if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
17408 rmtp = restart->nanosleep.rmtp;
17409 @@ -1532,8 +1715,10 @@
17413 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17414 - const enum hrtimer_mode mode, const clockid_t clockid)
17416 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17417 + const enum hrtimer_mode mode, const clockid_t clockid,
17418 + unsigned long state)
17420 struct restart_block *restart;
17421 struct hrtimer_sleeper t;
17422 @@ -1546,7 +1731,7 @@
17424 hrtimer_init_on_stack(&t.timer, clockid, mode);
17425 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
17426 - if (do_nanosleep(&t, mode))
17427 + if (do_nanosleep(&t, mode, state))
17430 /* Absolute timers do not update the rmtp value and restart: */
17431 @@ -1573,6 +1758,12 @@
17435 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17436 + const enum hrtimer_mode mode, const clockid_t clockid)
17438 + return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
17441 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
17442 struct timespec __user *, rmtp)
17444 @@ -1587,6 +1778,26 @@
17445 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
17448 +#ifdef CONFIG_PREEMPT_RT_FULL
17450 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
17452 +void cpu_chill(void)
17454 + struct timespec tu = {
17455 + .tv_nsec = NSEC_PER_MSEC,
17457 + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
17459 + current->flags |= PF_NOFREEZE;
17460 + __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
17461 + TASK_UNINTERRUPTIBLE);
17462 + if (!freeze_flag)
17463 + current->flags &= ~PF_NOFREEZE;
17465 +EXPORT_SYMBOL(cpu_chill);
17469 * Functions related to boot-time initialization:
17471 @@ -1598,10 +1809,14 @@
17472 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
17473 cpu_base->clock_base[i].cpu_base = cpu_base;
17474 timerqueue_init_head(&cpu_base->clock_base[i].active);
17475 + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
17478 cpu_base->cpu = cpu;
17479 hrtimer_init_hres(cpu_base);
17480 +#ifdef CONFIG_PREEMPT_RT_BASE
17481 + init_waitqueue_head(&cpu_base->wait);
17486 @@ -1671,9 +1886,26 @@
17488 #endif /* CONFIG_HOTPLUG_CPU */
17490 +#ifdef CONFIG_PREEMPT_RT_BASE
17492 +static void run_hrtimer_softirq(struct softirq_action *h)
17494 + hrtimer_rt_run_pending();
17497 +static void hrtimers_open_softirq(void)
17499 + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
17503 +static void hrtimers_open_softirq(void) { }
17506 void __init hrtimers_init(void)
17508 hrtimers_prepare_cpu(smp_processor_id());
17509 + hrtimers_open_softirq();
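cpu_chill() above is RT's replacement for busy-waiting on state owned by a task that may itself be preempted: instead of spinning, the waiter sleeps for one millisecond, uninterruptible and non-freezable. A hedged usage sketch (wait_for_flag is hypothetical; the declaration is assumed to come via linux/delay.h, which the softirq.c hunk above starts including):

    #include <linux/bitops.h>
    #include <linux/delay.h>

    static void wait_for_flag(unsigned long *state, int bit)
    {
        while (test_bit(bit, state)) {
    #ifdef CONFIG_PREEMPT_RT_FULL
            cpu_chill();    /* sleep 1 ms, let the holder run */
    #else
            cpu_relax();
    #endif
        }
    }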
17513 diff -Nur linux-4.9.6.orig/kernel/time/itimer.c linux-4.9.6/kernel/time/itimer.c
17514 --- linux-4.9.6.orig/kernel/time/itimer.c 2017-01-26 08:25:24.000000000 +0100
17515 +++ linux-4.9.6/kernel/time/itimer.c 2017-01-28 13:59:10.087662182 +0100
17516 @@ -213,6 +213,7 @@
17517 /* We are sharing ->siglock with it_real_fn() */
17518 if (hrtimer_try_to_cancel(timer) < 0) {
17519 spin_unlock_irq(&tsk->sighand->siglock);
17520 + hrtimer_wait_for_timer(&tsk->signal->real_timer);
17523 expires = timeval_to_ktime(value->it_value);
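This one-line itimer.c change plugs into the existing cancel-retry loop of do_setitimer(); schematically, the surrounding upstream code looks like this (a sketch, not part of the hunk):

    again:
        spin_lock_irq(&tsk->sighand->siglock);
        /* We are sharing ->siglock with it_real_fn() */
        if (hrtimer_try_to_cancel(timer) < 0) {
            spin_unlock_irq(&tsk->sighand->siglock);
            hrtimer_wait_for_timer(&tsk->signal->real_timer);
            goto again;
        }

Without the wait, a high-priority RT task spinning in this loop could livelock against the lower-priority context that is still executing the timer callback.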
17524 diff -Nur linux-4.9.6.orig/kernel/time/jiffies.c linux-4.9.6/kernel/time/jiffies.c
17525 --- linux-4.9.6.orig/kernel/time/jiffies.c 2017-01-26 08:25:24.000000000 +0100
17526 +++ linux-4.9.6/kernel/time/jiffies.c 2017-01-28 13:59:10.087662182 +0100
17531 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
17532 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
17533 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
17535 #if (BITS_PER_LONG < 64)
17536 u64 get_jiffies_64(void)
17541 - seq = read_seqbegin(&jiffies_lock);
17542 + seq = read_seqcount_begin(&jiffies_seq);
17544 - } while (read_seqretry(&jiffies_lock, seq));
17545 + } while (read_seqcount_retry(&jiffies_seq, seq));
17548 EXPORT_SYMBOL(get_jiffies_64);
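The conversion above splits the old jiffies_lock seqlock into a raw spinlock for writer exclusion plus a bare seqcount for readers, so the write side stays non-sleeping on RT while readers remain lockless. Both halves are used like this (the tick-common.c, tick-sched.c and timekeeping.c hunks below follow exactly this pattern):

        /* Writer (tick handler), serialized by the raw lock: */
        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);
        /* ... update jiffies_64, tick_next_period ... */
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        /* Lockless reader: */
        unsigned int seq;
        do {
            seq = read_seqcount_begin(&jiffies_seq);
            /* ... snapshot jiffies_64 ... */
        } while (read_seqcount_retry(&jiffies_seq, seq));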
17549 diff -Nur linux-4.9.6.orig/kernel/time/ntp.c linux-4.9.6/kernel/time/ntp.c
17550 --- linux-4.9.6.orig/kernel/time/ntp.c 2017-01-26 08:25:24.000000000 +0100
17551 +++ linux-4.9.6/kernel/time/ntp.c 2017-01-28 13:59:10.087662182 +0100
17553 #include <linux/module.h>
17554 #include <linux/rtc.h>
17555 #include <linux/math64.h>
17556 +#include <linux/swork.h>
17558 #include "ntp_internal.h"
17559 #include "timekeeping_internal.h"
17560 @@ -568,10 +569,35 @@
17561 &sync_cmos_work, timespec64_to_jiffies(&next));
17564 +#ifdef CONFIG_PREEMPT_RT_FULL
17566 +static void run_clock_set_delay(struct swork_event *event)
17568 + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17571 +static struct swork_event ntp_cmos_swork;
17573 +void ntp_notify_cmos_timer(void)
17575 + swork_queue(&ntp_cmos_swork);
17578 +static __init int create_cmos_delay_thread(void)
17580 + WARN_ON(swork_get());
17581 + INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
17584 +early_initcall(create_cmos_delay_thread);
17588 void ntp_notify_cmos_timer(void)
17590 queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17592 +#endif /* CONFIG_PREEMPT_RT_FULL */
17595 void ntp_notify_cmos_timer(void) { }
17596 diff -Nur linux-4.9.6.orig/kernel/time/posix-cpu-timers.c linux-4.9.6/kernel/time/posix-cpu-timers.c
17597 --- linux-4.9.6.orig/kernel/time/posix-cpu-timers.c 2017-01-26 08:25:24.000000000 +0100
17598 +++ linux-4.9.6/kernel/time/posix-cpu-timers.c 2017-01-28 13:59:10.087662182 +0100
17602 #include <linux/sched.h>
17603 +#include <linux/sched/rt.h>
17604 #include <linux/posix-timers.h>
17605 #include <linux/errno.h>
17606 #include <linux/math64.h>
17607 @@ -620,7 +621,7 @@
17609 * Disarm any old timer after extracting its expiry time.
17611 - WARN_ON_ONCE(!irqs_disabled());
17612 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17615 old_incr = timer->it.cpu.incr;
17616 @@ -1064,7 +1065,7 @@
17618 * Now re-arm for the new expiry time.
17620 - WARN_ON_ONCE(!irqs_disabled());
17621 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17623 unlock_task_sighand(p, &flags);
17625 @@ -1153,13 +1154,13 @@
17626 * already updated our counts. We need to check if any timers fire now.
17627 * Interrupts are disabled.
17629 -void run_posix_cpu_timers(struct task_struct *tsk)
17630 +static void __run_posix_cpu_timers(struct task_struct *tsk)
17633 struct k_itimer *timer, *next;
17634 unsigned long flags;
17636 - WARN_ON_ONCE(!irqs_disabled());
17637 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17640 * The fast path checks that there are no expired thread or thread
17641 @@ -1213,6 +1214,190 @@
17645 +#ifdef CONFIG_PREEMPT_RT_BASE
17646 +#include <linux/kthread.h>
17647 +#include <linux/cpu.h>
17648 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
17649 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
17651 +static int posix_cpu_timers_thread(void *data)
17653 + int cpu = (long)data;
17655 + BUG_ON(per_cpu(posix_timer_task, cpu) != current);
17657 + while (!kthread_should_stop()) {
17658 + struct task_struct *tsk = NULL;
17659 + struct task_struct *next = NULL;
17661 + if (cpu_is_offline(cpu))
17662 + goto wait_to_die;
17664 + /* grab task list */
17665 + raw_local_irq_disable();
17666 + tsk = per_cpu(posix_timer_tasklist, cpu);
17667 + per_cpu(posix_timer_tasklist, cpu) = NULL;
17668 + raw_local_irq_enable();
17670 + /* it's possible the list is empty, just return */
17672 + set_current_state(TASK_INTERRUPTIBLE);
17674 + __set_current_state(TASK_RUNNING);
17678 + /* Process task list */
17681 + next = tsk->posix_timer_list;
17683 + /* run the task timers, clear its ptr and
17686 + __run_posix_cpu_timers(tsk);
17687 + tsk->posix_timer_list = NULL;
17688 + put_task_struct(tsk);
17690 + /* check if this is the last on the list */
17699 + /* Wait for kthread_stop */
17700 + set_current_state(TASK_INTERRUPTIBLE);
17701 + while (!kthread_should_stop()) {
17703 + set_current_state(TASK_INTERRUPTIBLE);
17705 + __set_current_state(TASK_RUNNING);
17709 +static inline int __fastpath_timer_check(struct task_struct *tsk)
17711 + /* tsk == current, ensure it is safe to use ->signal/sighand */
17712 + if (unlikely(tsk->exit_state))
17715 + if (!task_cputime_zero(&tsk->cputime_expires))
17718 + if (!task_cputime_zero(&tsk->signal->cputime_expires))
17724 +void run_posix_cpu_timers(struct task_struct *tsk)
17726 + unsigned long cpu = smp_processor_id();
17727 + struct task_struct *tasklist;
17729 + BUG_ON(!irqs_disabled());
17730 + if (!per_cpu(posix_timer_task, cpu))
17732 + /* get per-cpu references */
17733 + tasklist = per_cpu(posix_timer_tasklist, cpu);
17735 + /* check to see if we're already queued */
17736 + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
17737 + get_task_struct(tsk);
17739 + tsk->posix_timer_list = tasklist;
17742 + * The list is terminated by a self-pointing
17745 + tsk->posix_timer_list = tsk;
17747 + per_cpu(posix_timer_tasklist, cpu) = tsk;
17749 + wake_up_process(per_cpu(posix_timer_task, cpu));
17754 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
17755 + * Here we can start up the necessary timer thread for the new CPU.
17757 +static int posix_cpu_thread_call(struct notifier_block *nfb,
17758 + unsigned long action, void *hcpu)
17760 + int cpu = (long)hcpu;
17761 + struct task_struct *p;
17762 + struct sched_param param;
17764 + switch (action) {
17765 + case CPU_UP_PREPARE:
17766 + p = kthread_create(posix_cpu_timers_thread, hcpu,
17767 + "posixcputmr/%d",cpu);
17769 + return NOTIFY_BAD;
17770 + p->flags |= PF_NOFREEZE;
17771 + kthread_bind(p, cpu);
17772 + /* Must be high prio to avoid getting starved */
17773 + param.sched_priority = MAX_RT_PRIO-1;
17774 + sched_setscheduler(p, SCHED_FIFO, &param);
17775 + per_cpu(posix_timer_task, cpu) = p;
17778 + /* Strictly unnecessary, as the first user will wake it. */
17779 + wake_up_process(per_cpu(posix_timer_task,cpu));
17781 +#ifdef CONFIG_HOTPLUG_CPU
17782 + case CPU_UP_CANCELED:
17783 + /* Unbind it from offline cpu so it can run. Fall thru. */
17784 + kthread_bind(per_cpu(posix_timer_task, cpu),
17785 + cpumask_any(cpu_online_mask));
17786 + kthread_stop(per_cpu(posix_timer_task, cpu));
17787 + per_cpu(posix_timer_task, cpu) = NULL;
17790 + kthread_stop(per_cpu(posix_timer_task, cpu));
17791 + per_cpu(posix_timer_task, cpu) = NULL;
17795 + return NOTIFY_OK;
17798 +/* Register at highest priority so that task migration (migrate_all_tasks)
17799 + * happens before everything else.
17801 +static struct notifier_block posix_cpu_thread_notifier = {
17802 + .notifier_call = posix_cpu_thread_call,
17806 +static int __init posix_cpu_thread_init(void)
17808 + void *hcpu = (void *)(long)smp_processor_id();
17809 + /* Start one for boot CPU. */
17810 + unsigned long cpu;
17812 + /* init the per-cpu posix_timer_tasklists */
17813 + for_each_possible_cpu(cpu)
17814 + per_cpu(posix_timer_tasklist, cpu) = NULL;
17816 + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
17817 + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
17818 + register_cpu_notifier(&posix_cpu_thread_notifier);
17821 +early_initcall(posix_cpu_thread_init);
17822 +#else /* CONFIG_PREEMPT_RT_BASE */
17823 +void run_posix_cpu_timers(struct task_struct *tsk)
17825 + __run_posix_cpu_timers(tsk);
17827 +#endif /* CONFIG_PREEMPT_RT_BASE */
17830 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
17831 * The tsk->sighand->siglock must be held by the caller.
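A subtle detail in run_posix_cpu_timers() above: posix_timer_list terminates with a self-pointing tail rather than NULL, so NULL can keep meaning "not queued". A standalone userspace model of walking such a list (illustrative only):

    #include <stdio.h>

    struct task {
        const char *name;
        struct task *next;    /* NULL: not queued; self: tail */
    };

    int main(void)
    {
        struct task c = { "C", &c };    /* tail points to itself */
        struct task b = { "B", &c };
        struct task a = { "A", &b };
        struct task *t = &a;

        while (t) {
            struct task *next = t->next;

            printf("%s\n", t->name);
            t->next = NULL;                   /* mark as dequeued */
            t = (next == t) ? NULL : next;    /* self means: done */
        }
        return 0;
    }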
17832 diff -Nur linux-4.9.6.orig/kernel/time/posix-timers.c linux-4.9.6/kernel/time/posix-timers.c
17833 --- linux-4.9.6.orig/kernel/time/posix-timers.c 2017-01-26 08:25:24.000000000 +0100
17834 +++ linux-4.9.6/kernel/time/posix-timers.c 2017-01-28 13:59:10.087662182 +0100
17835 @@ -506,6 +506,7 @@
17836 static struct pid *good_sigevent(sigevent_t * event)
17838 struct task_struct *rtn = current->group_leader;
17839 + int sig = event->sigev_signo;
17841 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
17842 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
17843 @@ -514,7 +515,8 @@
17846 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
17847 - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
17848 + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
17849 + sig_kernel_coredump(sig)))
17852 return task_pid(rtn);
17853 @@ -826,6 +828,20 @@
17858 + * Protected by RCU!
17860 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
17862 +#ifdef CONFIG_PREEMPT_RT_FULL
17863 + if (kc->timer_set == common_timer_set)
17864 + hrtimer_wait_for_timer(&timr->it.real.timer);
17866 + /* FIXME: Whacky hack for posix-cpu-timers */
17867 + schedule_timeout(1);
17871 /* Set a POSIX.1b interval timer. */
17872 /* timr->it_lock is taken. */
17874 @@ -903,6 +919,7 @@
17879 kc = clockid_to_kclock(timr->it_clock);
17880 if (WARN_ON_ONCE(!kc || !kc->timer_set))
17882 @@ -911,9 +928,12 @@
17884 unlock_timer(timr, flag);
17885 if (error == TIMER_RETRY) {
17886 + timer_wait_for_callback(kc, timr);
17887 rtn = NULL; // We already got the old time...
17888 + rcu_read_unlock();
17891 + rcu_read_unlock();
17893 if (old_setting && !error &&
17894 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
17895 @@ -951,10 +971,15 @@
17900 if (timer_delete_hook(timer) == TIMER_RETRY) {
17901 unlock_timer(timer, flags);
17902 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
17904 + rcu_read_unlock();
17907 + rcu_read_unlock();
17909 spin_lock(&current->sighand->siglock);
17910 list_del(&timer->list);
17911 @@ -980,8 +1005,18 @@
17913 spin_lock_irqsave(&timer->it_lock, flags);
17915 + /* On RT we can race with a deletion */
17916 + if (!timer->it_signal) {
17917 + unlock_timer(timer, flags);
17921 if (timer_delete_hook(timer) == TIMER_RETRY) {
17923 unlock_timer(timer, flags);
17924 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
17926 + rcu_read_unlock();
17929 list_del(&timer->list);
17930 diff -Nur linux-4.9.6.orig/kernel/time/tick-broadcast-hrtimer.c linux-4.9.6/kernel/time/tick-broadcast-hrtimer.c
17931 --- linux-4.9.6.orig/kernel/time/tick-broadcast-hrtimer.c 2017-01-26 08:25:24.000000000 +0100
17932 +++ linux-4.9.6/kernel/time/tick-broadcast-hrtimer.c 2017-01-28 13:59:10.087662182 +0100
17933 @@ -107,5 +107,6 @@
17935 hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
17936 bctimer.function = bc_handler;
17937 + bctimer.irqsafe = true;
17938 clockevents_register_device(&ce_broadcast_hrtimer);
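Setting ->irqsafe here matters because, with the hrtimer changes above, RT defers all non-irqsafe callbacks to the HRTIMER_SOFTIRQ thread (see hrtimer_rt_defer()); the broadcast handler must keep firing from hard irq context. The same marking pattern, sketched with hypothetical names (irqsafe is the struct hrtimer field this patch introduces):

    static struct hrtimer my_timer;

    /* Must not sleep: on RT this still runs in hard irq context. */
    static enum hrtimer_restart my_fn(struct hrtimer *t)
    {
        return HRTIMER_NORESTART;
    }

    static void my_setup(void)
    {
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        my_timer.function = my_fn;
        my_timer.irqsafe = 1;
    }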
17940 diff -Nur linux-4.9.6.orig/kernel/time/tick-common.c linux-4.9.6/kernel/time/tick-common.c
17941 --- linux-4.9.6.orig/kernel/time/tick-common.c 2017-01-26 08:25:24.000000000 +0100
17942 +++ linux-4.9.6/kernel/time/tick-common.c 2017-01-28 13:59:10.087662182 +0100
17943 @@ -79,13 +79,15 @@
17944 static void tick_periodic(int cpu)
17946 if (tick_do_timer_cpu == cpu) {
17947 - write_seqlock(&jiffies_lock);
17948 + raw_spin_lock(&jiffies_lock);
17949 + write_seqcount_begin(&jiffies_seq);
17951 /* Keep track of the next tick event */
17952 tick_next_period = ktime_add(tick_next_period, tick_period);
17955 - write_sequnlock(&jiffies_lock);
17956 + write_seqcount_end(&jiffies_seq);
17957 + raw_spin_unlock(&jiffies_lock);
17958 update_wall_time();
17961 @@ -157,9 +159,9 @@
17965 - seq = read_seqbegin(&jiffies_lock);
17966 + seq = read_seqcount_begin(&jiffies_seq);
17967 next = tick_next_period;
17968 - } while (read_seqretry(&jiffies_lock, seq));
17969 + } while (read_seqcount_retry(&jiffies_seq, seq));
17971 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
17973 diff -Nur linux-4.9.6.orig/kernel/time/tick-sched.c linux-4.9.6/kernel/time/tick-sched.c
17974 --- linux-4.9.6.orig/kernel/time/tick-sched.c 2017-01-26 08:25:24.000000000 +0100
17975 +++ linux-4.9.6/kernel/time/tick-sched.c 2017-01-28 13:59:10.087662182 +0100
17979 /* Reevaluate with jiffies_lock held */
17980 - write_seqlock(&jiffies_lock);
17981 + raw_spin_lock(&jiffies_lock);
17982 + write_seqcount_begin(&jiffies_seq);
17984 delta = ktime_sub(now, last_jiffies_update);
17985 if (delta.tv64 >= tick_period.tv64) {
17986 @@ -85,10 +86,12 @@
17987 /* Keep the tick_next_period variable up to date */
17988 tick_next_period = ktime_add(last_jiffies_update, tick_period);
17990 - write_sequnlock(&jiffies_lock);
17991 + write_seqcount_end(&jiffies_seq);
17992 + raw_spin_unlock(&jiffies_lock);
17995 - write_sequnlock(&jiffies_lock);
17996 + write_seqcount_end(&jiffies_seq);
17997 + raw_spin_unlock(&jiffies_lock);
17998 update_wall_time();
18001 @@ -99,12 +102,14 @@
18005 - write_seqlock(&jiffies_lock);
18006 + raw_spin_lock(&jiffies_lock);
18007 + write_seqcount_begin(&jiffies_seq);
18008 /* Did we start the jiffies update yet ? */
18009 if (last_jiffies_update.tv64 == 0)
18010 last_jiffies_update = tick_next_period;
18011 period = last_jiffies_update;
18012 - write_sequnlock(&jiffies_lock);
18013 + write_seqcount_end(&jiffies_seq);
18014 + raw_spin_unlock(&jiffies_lock);
18018 @@ -215,6 +220,7 @@
18020 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
18021 .func = nohz_full_kick_func,
18022 + .flags = IRQ_WORK_HARD_IRQ,
18026 @@ -673,10 +679,10 @@
18028 /* Read jiffies and the time when jiffies were updated last */
18030 - seq = read_seqbegin(&jiffies_lock);
18031 + seq = read_seqcount_begin(&jiffies_seq);
18032 basemono = last_jiffies_update.tv64;
18033 basejiff = jiffies;
18034 - } while (read_seqretry(&jiffies_lock, seq));
18035 + } while (read_seqcount_retry(&jiffies_seq, seq));
18036 ts->last_jiffies = basejiff;
18038 if (rcu_needs_cpu(basemono, &next_rcu) ||
18039 @@ -877,14 +883,7 @@
18042 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
18043 - static int ratelimit;
18045 - if (ratelimit < 10 &&
18046 - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18047 - pr_warn("NOHZ: local_softirq_pending %02x\n",
18048 - (unsigned int) local_softirq_pending());
18051 + softirq_check_pending_idle();
18055 @@ -1193,6 +1192,7 @@
18056 * Emulate tick processing via per-CPU hrtimers:
18058 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18059 + ts->sched_timer.irqsafe = 1;
18060 ts->sched_timer.function = tick_sched_timer;
18062 /* Get the next period (per-CPU) */
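
Two RT details in the tick-sched hunks are worth spelling out: the per-CPU
tick-emulation hrtimer gets the same irqsafe treatment as the broadcast timer
earlier, and the nohz_full kick is tagged IRQ_WORK_HARD_IRQ, an RT-patch flag
that keeps an irq_work item in hard interrupt context instead of the softirq
where RT normally defers such work. The tagging in isolation (demo_work and
demo_func are made-up names):

#include <linux/irq_work.h>

static void demo_func(struct irq_work *w)
{
	/* must stay short and non-sleeping: hard irq context */
}

static struct irq_work demo_work = {
	.func  = demo_func,
	.flags = IRQ_WORK_HARD_IRQ,	/* same pattern as nohz_full_kick_work */
};

static void demo_kick(void)		/* callable from non-sleeping context */
{
	irq_work_queue(&demo_work);
}
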
18063 diff -Nur linux-4.9.6.orig/kernel/time/timekeeping.c linux-4.9.6/kernel/time/timekeeping.c
18064 --- linux-4.9.6.orig/kernel/time/timekeeping.c 2017-01-26 08:25:24.000000000 +0100
18065 +++ linux-4.9.6/kernel/time/timekeeping.c 2017-01-28 13:59:10.087662182 +0100
18066 @@ -2328,8 +2328,10 @@
18068 void xtime_update(unsigned long ticks)
18070 - write_seqlock(&jiffies_lock);
18071 + raw_spin_lock(&jiffies_lock);
18072 + write_seqcount_begin(&jiffies_seq);
18074 - write_sequnlock(&jiffies_lock);
18075 + write_seqcount_end(&jiffies_seq);
18076 + raw_spin_unlock(&jiffies_lock);
18077 update_wall_time();
18079 diff -Nur linux-4.9.6.orig/kernel/time/timekeeping.h linux-4.9.6/kernel/time/timekeeping.h
18080 --- linux-4.9.6.orig/kernel/time/timekeeping.h 2017-01-26 08:25:24.000000000 +0100
18081 +++ linux-4.9.6/kernel/time/timekeeping.h 2017-01-28 13:59:10.087662182 +0100
18083 extern void do_timer(unsigned long ticks);
18084 extern void update_wall_time(void);
18086 -extern seqlock_t jiffies_lock;
18087 +extern raw_spinlock_t jiffies_lock;
18088 +extern seqcount_t jiffies_seq;
18090 #define CS_NAME_LEN 32
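
All of the jiffies_lock hunks above implement a single change: the seqlock is
split into a raw spinlock (writer exclusion, never sleeps, so the tick path
can take it even on RT) plus a bare seqcount for reader retry. A userspace
analogue of the resulting pattern, with a pthread mutex standing in for the
raw spinlock and C11 atomics for the seqcount (sketch only, not kernel code):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t jlock = PTHREAD_MUTEX_INITIALIZER; /* writer exclusion */
static atomic_uint jseq;			/* analogue of jiffies_seq */
static _Atomic unsigned long long jiffies64;	/* the protected value */

static void tick(void)		/* writer side, cf. tick_periodic() */
{
	pthread_mutex_lock(&jlock);
	jseq++;			/* odd: update in progress */
	jiffies64++;
	jseq++;			/* even again: update complete */
	pthread_mutex_unlock(&jlock);
}

static unsigned long long read_jiffies(void)	/* lockless reader */
{
	unsigned int s;
	unsigned long long v;

	do {
		s = jseq;
		v = jiffies64;
	} while ((s & 1) || s != jseq);	/* retry if a write ran concurrently */
	return v;
}
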
18092 diff -Nur linux-4.9.6.orig/kernel/time/timer.c linux-4.9.6/kernel/time/timer.c
18093 --- linux-4.9.6.orig/kernel/time/timer.c 2017-01-26 08:25:24.000000000 +0100
18094 +++ linux-4.9.6/kernel/time/timer.c 2017-01-28 13:59:10.087662182 +0100
18095 @@ -193,8 +193,11 @@
18098 struct timer_base {
18100 + raw_spinlock_t lock;
18101 struct timer_list *running_timer;
18102 +#ifdef CONFIG_PREEMPT_RT_FULL
18103 + struct swait_queue_head wait_for_running_timer;
18106 unsigned long next_expiry;
18108 @@ -948,10 +951,10 @@
18110 if (!(tf & TIMER_MIGRATING)) {
18111 base = get_timer_base(tf);
18112 - spin_lock_irqsave(&base->lock, *flags);
18113 + raw_spin_lock_irqsave(&base->lock, *flags);
18114 if (timer->flags == tf)
18116 - spin_unlock_irqrestore(&base->lock, *flags);
18117 + raw_spin_unlock_irqrestore(&base->lock, *flags);
18121 @@ -1023,9 +1026,9 @@
18122 /* See the comment in lock_timer_base() */
18123 timer->flags |= TIMER_MIGRATING;
18125 - spin_unlock(&base->lock);
18126 + raw_spin_unlock(&base->lock);
18128 - spin_lock(&base->lock);
18129 + raw_spin_lock(&base->lock);
18130 WRITE_ONCE(timer->flags,
18131 (timer->flags & ~TIMER_BASEMASK) | base->cpu);
18133 @@ -1050,7 +1053,7 @@
18137 - spin_unlock_irqrestore(&base->lock, flags);
18138 + raw_spin_unlock_irqrestore(&base->lock, flags);
18142 @@ -1144,19 +1147,46 @@
18143 if (base != new_base) {
18144 timer->flags |= TIMER_MIGRATING;
18146 - spin_unlock(&base->lock);
18147 + raw_spin_unlock(&base->lock);
18149 - spin_lock(&base->lock);
18150 + raw_spin_lock(&base->lock);
18151 WRITE_ONCE(timer->flags,
18152 (timer->flags & ~TIMER_BASEMASK) | cpu);
18155 debug_activate(timer, timer->expires);
18156 internal_add_timer(base, timer);
18157 - spin_unlock_irqrestore(&base->lock, flags);
18158 + raw_spin_unlock_irqrestore(&base->lock, flags);
18160 EXPORT_SYMBOL_GPL(add_timer_on);
18162 +#ifdef CONFIG_PREEMPT_RT_FULL
18164 + * Wait for a running timer
18166 +static void wait_for_running_timer(struct timer_list *timer)
18168 + struct timer_base *base;
18169 + u32 tf = timer->flags;
18171 + if (tf & TIMER_MIGRATING)
18174 + base = get_timer_base(tf);
18175 + swait_event(base->wait_for_running_timer,
18176 + base->running_timer != timer);
18179 +# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
18181 +static inline void wait_for_running_timer(struct timer_list *timer)
18186 +# define wakeup_timer_waiters(b) do { } while (0)
18190 * del_timer - deactivate a timer.
18191 * @timer: the timer to be deactivated
18192 @@ -1180,7 +1210,7 @@
18193 if (timer_pending(timer)) {
18194 base = lock_timer_base(timer, &flags);
18195 ret = detach_if_pending(timer, base, true);
18196 - spin_unlock_irqrestore(&base->lock, flags);
18197 + raw_spin_unlock_irqrestore(&base->lock, flags);
18201 @@ -1208,13 +1238,13 @@
18202 timer_stats_timer_clear_start_info(timer);
18203 ret = detach_if_pending(timer, base, true);
18205 - spin_unlock_irqrestore(&base->lock, flags);
18206 + raw_spin_unlock_irqrestore(&base->lock, flags);
18210 EXPORT_SYMBOL(try_to_del_timer_sync);
18213 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
18215 * del_timer_sync - deactivate a timer and wait for the handler to finish.
18216 * @timer: the timer to be deactivated
18217 @@ -1274,7 +1304,7 @@
18218 int ret = try_to_del_timer_sync(timer);
18222 + wait_for_running_timer(timer);
18225 EXPORT_SYMBOL(del_timer_sync);
18226 @@ -1339,14 +1369,17 @@
18227 fn = timer->function;
18228 data = timer->data;
18230 - if (timer->flags & TIMER_IRQSAFE) {
18231 - spin_unlock(&base->lock);
18232 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
18233 + timer->flags & TIMER_IRQSAFE) {
18234 + raw_spin_unlock(&base->lock);
18235 call_timer_fn(timer, fn, data);
18236 - spin_lock(&base->lock);
18237 + base->running_timer = NULL;
18238 + raw_spin_lock(&base->lock);
18240 - spin_unlock_irq(&base->lock);
18241 + raw_spin_unlock_irq(&base->lock);
18242 call_timer_fn(timer, fn, data);
18243 - spin_lock_irq(&base->lock);
18244 + base->running_timer = NULL;
18245 + raw_spin_lock_irq(&base->lock);
18249 @@ -1515,7 +1548,7 @@
18250 if (cpu_is_offline(smp_processor_id()))
18253 - spin_lock(&base->lock);
18254 + raw_spin_lock(&base->lock);
18255 nextevt = __next_timer_interrupt(base);
18256 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
18257 base->next_expiry = nextevt;
18258 @@ -1543,7 +1576,7 @@
18259 if ((expires - basem) > TICK_NSEC)
18260 base->is_idle = true;
18262 - spin_unlock(&base->lock);
18263 + raw_spin_unlock(&base->lock);
18265 return cmp_next_hrtimer_event(basem, expires);
18267 @@ -1608,13 +1641,13 @@
18269 /* Note: this timer irq context must be accounted for as well. */
18270 account_process_tick(p, user_tick);
18271 + scheduler_tick();
18272 run_local_timers();
18273 rcu_check_callbacks(user_tick);
18274 -#ifdef CONFIG_IRQ_WORK
18275 +#if defined(CONFIG_IRQ_WORK)
18279 - scheduler_tick();
18280 run_posix_cpu_timers(p);
18283 @@ -1630,7 +1663,7 @@
18284 if (!time_after_eq(jiffies, base->clk))
18287 - spin_lock_irq(&base->lock);
18288 + raw_spin_lock_irq(&base->lock);
18290 while (time_after_eq(jiffies, base->clk)) {
18292 @@ -1640,8 +1673,8 @@
18294 expire_timers(base, heads + levels);
18296 - base->running_timer = NULL;
18297 - spin_unlock_irq(&base->lock);
18298 + raw_spin_unlock_irq(&base->lock);
18299 + wakeup_timer_waiters(base);
18303 @@ -1651,6 +1684,8 @@
18305 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
18307 + irq_work_tick_soft();
18309 __run_timers(base);
18310 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
18311 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
18312 @@ -1836,16 +1871,16 @@
18313 * The caller is globally serialized and nobody else
18314 * takes two locks at once, deadlock is not possible.
18316 - spin_lock_irq(&new_base->lock);
18317 - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18318 + raw_spin_lock_irq(&new_base->lock);
18319 + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18321 BUG_ON(old_base->running_timer);
18323 for (i = 0; i < WHEEL_SIZE; i++)
18324 migrate_timer_list(new_base, old_base->vectors + i);
18326 - spin_unlock(&old_base->lock);
18327 - spin_unlock_irq(&new_base->lock);
18328 + raw_spin_unlock(&old_base->lock);
18329 + raw_spin_unlock_irq(&new_base->lock);
18330 put_cpu_ptr(&timer_bases);
18333 @@ -1861,8 +1896,11 @@
18334 for (i = 0; i < NR_BASES; i++) {
18335 base = per_cpu_ptr(&timer_bases[i], cpu);
18337 - spin_lock_init(&base->lock);
18338 + raw_spin_lock_init(&base->lock);
18339 base->clk = jiffies;
18340 +#ifdef CONFIG_PREEMPT_RT_FULL
18341 + init_swait_queue_head(&base->wait_for_running_timer);
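
The timer.c conversion follows the same recipe: base->lock becomes a raw
spinlock so the timer softirq (threaded on RT) can take it, and
del_timer_sync() stops busy-waiting on a running callback. Spinning can
livelock on RT when the deleter has preempted the softirq thread that must
finish the callback, so deleters now sleep on the new swait queue and the
expiry path wakes them after clearing running_timer. A condition-variable
analogue of that handshake (userspace sketch, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct base {
	pthread_mutex_t lock;
	pthread_cond_t  waiters;	/* wait_for_running_timer */
	void           *running_timer;	/* callback currently executing */
};

/* expiry side: run one callback, then wake any sleeping deleters */
static void expire_one(struct base *b, void *timer, void (*fn)(void))
{
	pthread_mutex_lock(&b->lock);
	b->running_timer = timer;
	pthread_mutex_unlock(&b->lock);

	fn();				/* callback runs without the lock */

	pthread_mutex_lock(&b->lock);
	b->running_timer = NULL;
	pthread_cond_broadcast(&b->waiters);	/* wakeup_timer_waiters() */
	pthread_mutex_unlock(&b->lock);
}

/* deletion side: sleep instead of spinning until the callback is done */
static void wait_for_running_timer(struct base *b, void *timer)
{
	pthread_mutex_lock(&b->lock);
	while (b->running_timer == timer)
		pthread_cond_wait(&b->waiters, &b->lock);
	pthread_mutex_unlock(&b->lock);
}
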
18346 diff -Nur linux-4.9.6.orig/kernel/trace/Kconfig linux-4.9.6/kernel/trace/Kconfig
18347 --- linux-4.9.6.orig/kernel/trace/Kconfig 2017-01-26 08:25:24.000000000 +0100
18348 +++ linux-4.9.6/kernel/trace/Kconfig 2017-01-28 13:59:10.087662182 +0100
18349 @@ -182,6 +182,24 @@
18350 enabled. This option and the preempt-off timing option can be
18351 used together or separately.)
18353 +config INTERRUPT_OFF_HIST
18354 + bool "Interrupts-off Latency Histogram"
18355 + depends on IRQSOFF_TRACER
18357 + This option generates continuously updated histograms (one per cpu)
18358 + of the duration of time periods with interrupts disabled. The
18358 +	  histograms are disabled by default. To enable them, write a non-zero
18359 +	  number to
18362 + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18364 + If PREEMPT_OFF_HIST is also selected, additional histograms (one
18365 + per cpu) are generated that accumulate the duration of time periods
18366 + when both interrupts and preemption are disabled. The histogram data
18367 + will be located in the debug file system at
18369 + /sys/kernel/debug/tracing/latency_hist/irqsoff
18371 config PREEMPT_TRACER
18372 bool "Preemption-off Latency Tracer"
18374 @@ -206,6 +224,24 @@
18375 enabled. This option and the irqs-off timing option can be
18376 used together or separately.)
18378 +config PREEMPT_OFF_HIST
18379 + bool "Preemption-off Latency Histogram"
18380 + depends on PREEMPT_TRACER
18382 + This option generates continuously updated histograms (one per cpu)
18383 + of the duration of time periods with preemption disabled. The
18384 +	  histograms are disabled by default. To enable them, write a non-zero
18385 +	  number to
18387 + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18389 + If INTERRUPT_OFF_HIST is also selected, additional histograms (one
18390 + per cpu) are generated that accumulate the duration of time periods
18391 + when both interrupts and preemption are disabled. The histogram data
18392 + will be located in the debug file system at
18394 + /sys/kernel/debug/tracing/latency_hist/preemptoff
18396 config SCHED_TRACER
18397 bool "Scheduling Latency Tracer"
18398 select GENERIC_TRACER
18399 @@ -251,6 +287,74 @@
18400 file. Every time a latency is greater than tracing_thresh, it will
18401 be recorded into the ring buffer.
18403 +config WAKEUP_LATENCY_HIST
18404 + bool "Scheduling Latency Histogram"
18405 + depends on SCHED_TRACER
18407 + This option generates continuously updated histograms (one per cpu)
18408 + of the scheduling latency of the highest priority task.
18409 + The histograms are disabled by default. To enable them, write a
18410 + non-zero number to
18412 + /sys/kernel/debug/tracing/latency_hist/enable/wakeup
18414 + Two different algorithms are used, one to determine the latency of
18415 + processes that exclusively use the highest priority of the system and
18416 + another one to determine the latency of processes that share the
18417 + highest system priority with other processes. The former is used to
18418 + improve hardware and system software, the latter to optimize the
18419 + priority design of a given system. The histogram data will be
18420 + located in the debug file system at
18422 +	      /sys/kernel/debug/tracing/latency_hist/wakeup
18424 +	  and
18426 + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
18428 + If both Scheduling Latency Histogram and Missed Timer Offsets
18429 + Histogram are selected, additional histogram data will be collected
18430 + that contain, in addition to the wakeup latency, the timer latency, in
18431 + case the wakeup was triggered by an expired timer. These histograms
18432 + are available in the
18434 + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18436 + directory. They reflect the apparent interrupt and scheduling latency
18437 +	  and are best suited to determining the worst-case latency of a given
18438 + system. To enable these histograms, write a non-zero number to
18440 + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18442 +config MISSED_TIMER_OFFSETS_HIST
18443 + depends on HIGH_RES_TIMERS
18444 + select GENERIC_TRACER
18445 + bool "Missed Timer Offsets Histogram"
18447 + Generate a histogram of missed timer offsets in microseconds. The
18448 +	  histograms are disabled by default. To enable them, write a non-zero
18449 +	  number to
18451 + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
18453 + The histogram data will be located in the debug file system at
18455 + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
18457 + If both Scheduling Latency Histogram and Missed Timer Offsets
18458 + Histogram are selected, additional histogram data will be collected
18459 + that contain, in addition to the wakeup latency, the timer latency, in
18460 + case the wakeup was triggered by an expired timer. These histograms
18461 + are available in the
18463 + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18465 + directory. They reflect the apparent interrupt and scheduling latency
18466 +	  and are best suited to determining the worst-case latency of a given
18467 + system. To enable these histograms, write a non-zero number to
18469 + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18471 config ENABLE_DEFAULT_TRACERS
18472 bool "Trace process context switches and events"
18473 depends on !GENERIC_TRACER
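
All four histogram options are operated the same way at run time: write a
non-zero value to the matching file under
/sys/kernel/debug/tracing/latency_hist/enable/ and read the per-CPU "CPU<n>"
files back (the naming comes from latency_hist_init() below). A minimal
sketch for the wakeup histogram, assuming debugfs is mounted in the usual
place and the relevant *_HIST options are enabled:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define HIST "/sys/kernel/debug/tracing/latency_hist/"

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(HIST "enable/wakeup", O_WRONLY);
	if (fd < 0) {
		perror(HIST "enable/wakeup");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		return 1;
	}
	close(fd);

	sleep(10);	/* let the workload of interest run */

	fd = open(HIST "wakeup/CPU0", O_RDONLY);
	if (fd < 0) {
		perror(HIST "wakeup/CPU0");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}
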
18474 diff -Nur linux-4.9.6.orig/kernel/trace/latency_hist.c linux-4.9.6/kernel/trace/latency_hist.c
18475 --- linux-4.9.6.orig/kernel/trace/latency_hist.c 1970-01-01 01:00:00.000000000 +0100
18476 +++ linux-4.9.6/kernel/trace/latency_hist.c 2017-01-28 13:59:10.091662336 +0100
18479 + * kernel/trace/latency_hist.c
18481 + * Add support for histograms of preemption-off latency and
18482 + * interrupt-off latency and wakeup latency, it depends on
18483 + * Real-Time Preemption Support.
18485 + * Copyright (C) 2005 MontaVista Software, Inc.
18486 + * Yi Yang <yyang@ch.mvista.com>
18488 + * Converted to work with the new latency tracer.
18489 + * Copyright (C) 2008 Red Hat, Inc.
18490 + * Steven Rostedt <srostedt@redhat.com>
18493 +#include <linux/module.h>
18494 +#include <linux/debugfs.h>
18495 +#include <linux/seq_file.h>
18496 +#include <linux/percpu.h>
18497 +#include <linux/kallsyms.h>
18498 +#include <linux/uaccess.h>
18499 +#include <linux/sched.h>
18500 +#include <linux/sched/rt.h>
18501 +#include <linux/slab.h>
18502 +#include <linux/atomic.h>
18503 +#include <asm/div64.h>
18505 +#include "trace.h"
18506 +#include <trace/events/sched.h>
18508 +#define NSECS_PER_USECS 1000L
18510 +#define CREATE_TRACE_POINTS
18511 +#include <trace/events/hist.h>
18514 + IRQSOFF_LATENCY = 0,
18515 + PREEMPTOFF_LATENCY,
18516 + PREEMPTIRQSOFF_LATENCY,
18518 + WAKEUP_LATENCY_SHAREDPRIO,
18519 + MISSED_TIMER_OFFSETS,
18520 + TIMERANDWAKEUP_LATENCY,
18521 + MAX_LATENCY_TYPE,
18524 +#define MAX_ENTRY_NUM 10240
18526 +struct hist_data {
18527 + atomic_t hist_mode; /* 0 log, 1 don't log */
18528 + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
18531 + unsigned long long below_hist_bound_samples;
18532 + unsigned long long above_hist_bound_samples;
18533 + long long accumulate_lat;
18534 + unsigned long long total_samples;
18535 + unsigned long long hist_array[MAX_ENTRY_NUM];
18538 +struct enable_data {
18539 + int latency_type;
18543 +static char *latency_hist_dir_root = "latency_hist";
18545 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18546 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
18547 +static char *irqsoff_hist_dir = "irqsoff";
18548 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
18549 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
18552 +#ifdef CONFIG_PREEMPT_OFF_HIST
18553 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
18554 +static char *preemptoff_hist_dir = "preemptoff";
18555 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
18556 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
18559 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
18560 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
18561 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
18562 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
18563 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
18566 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
18567 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
18568 +static struct enable_data preemptirqsoff_enabled_data = {
18569 + .latency_type = PREEMPTIRQSOFF_LATENCY,
18574 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18575 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18576 +struct maxlatproc_data {
18577 + char comm[FIELD_SIZEOF(struct task_struct, comm)];
18578 + char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
18582 + int current_prio;
18584 + long timeroffset;
18585 + cycle_t timestamp;
18589 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18590 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
18591 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
18592 +static char *wakeup_latency_hist_dir = "wakeup";
18593 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
18594 +static notrace void probe_wakeup_latency_hist_start(void *v,
18595 + struct task_struct *p);
18596 +static notrace void probe_wakeup_latency_hist_stop(void *v,
18597 + bool preempt, struct task_struct *prev, struct task_struct *next);
18598 +static notrace void probe_sched_migrate_task(void *,
18599 + struct task_struct *task, int cpu);
18600 +static struct enable_data wakeup_latency_enabled_data = {
18601 + .latency_type = WAKEUP_LATENCY,
18604 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
18605 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
18606 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
18607 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
18608 +static unsigned long wakeup_pid;
18611 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18612 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
18613 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
18614 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
18615 + long long offset, struct task_struct *curr, struct task_struct *task);
18616 +static struct enable_data missed_timer_offsets_enabled_data = {
18617 + .latency_type = MISSED_TIMER_OFFSETS,
18620 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
18621 +static unsigned long missed_timer_offsets_pid;
18624 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18625 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18626 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
18627 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
18628 +static struct enable_data timerandwakeup_enabled_data = {
18629 + .latency_type = TIMERANDWAKEUP_LATENCY,
18632 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
18635 +void notrace latency_hist(int latency_type, int cpu, long latency,
18636 + long timeroffset, cycle_t stop,
18637 + struct task_struct *p)
18639 + struct hist_data *my_hist;
18640 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18641 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18642 + struct maxlatproc_data *mp = NULL;
18645 + if (!cpu_possible(cpu) || latency_type < 0 ||
18646 + latency_type >= MAX_LATENCY_TYPE)
18649 + switch (latency_type) {
18650 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18651 + case IRQSOFF_LATENCY:
18652 + my_hist = &per_cpu(irqsoff_hist, cpu);
18655 +#ifdef CONFIG_PREEMPT_OFF_HIST
18656 + case PREEMPTOFF_LATENCY:
18657 + my_hist = &per_cpu(preemptoff_hist, cpu);
18660 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
18661 + case PREEMPTIRQSOFF_LATENCY:
18662 + my_hist = &per_cpu(preemptirqsoff_hist, cpu);
18665 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18666 + case WAKEUP_LATENCY:
18667 + my_hist = &per_cpu(wakeup_latency_hist, cpu);
18668 + mp = &per_cpu(wakeup_maxlatproc, cpu);
18670 + case WAKEUP_LATENCY_SHAREDPRIO:
18671 + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
18672 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
18675 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18676 + case MISSED_TIMER_OFFSETS:
18677 + my_hist = &per_cpu(missed_timer_offsets, cpu);
18678 + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
18681 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18682 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18683 + case TIMERANDWAKEUP_LATENCY:
18684 + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
18685 + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
18693 + latency += my_hist->offset;
18695 + if (atomic_read(&my_hist->hist_mode) == 0)
18698 + if (latency < 0 || latency >= MAX_ENTRY_NUM) {
18700 + my_hist->below_hist_bound_samples++;
18702 + my_hist->above_hist_bound_samples++;
18704 + my_hist->hist_array[latency]++;
18706 + if (unlikely(latency > my_hist->max_lat ||
18707 + my_hist->min_lat == LONG_MAX)) {
18708 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18709 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18710 + if (latency_type == WAKEUP_LATENCY ||
18711 + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
18712 + latency_type == MISSED_TIMER_OFFSETS ||
18713 + latency_type == TIMERANDWAKEUP_LATENCY) {
18714 + strncpy(mp->comm, p->comm, sizeof(mp->comm));
18715 + strncpy(mp->current_comm, current->comm,
18716 + sizeof(mp->current_comm));
18717 + mp->pid = task_pid_nr(p);
18718 + mp->current_pid = task_pid_nr(current);
18719 + mp->prio = p->prio;
18720 + mp->current_prio = current->prio;
18721 + mp->latency = latency;
18722 + mp->timeroffset = timeroffset;
18723 + mp->timestamp = stop;
18726 + my_hist->max_lat = latency;
18728 + if (unlikely(latency < my_hist->min_lat))
18729 + my_hist->min_lat = latency;
18730 + my_hist->total_samples++;
18731 + my_hist->accumulate_lat += latency;
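
The accounting above is deliberately simple: one histogram slot per
microsecond at index latency + offset (the struct comment earlier suggests
offset = MAX_ENTRY_NUM/2 when a bipolar scale is wanted), and out-of-range
samples are only counted in the below/above counters, never stored. The same
arithmetic as a standalone demo:

#include <stdio.h>

#define MAX_ENTRY_NUM 10240

struct hist {
	long long slot[MAX_ENTRY_NUM];
	long long below, above;	/* below/above_hist_bound_samples */
	long offset;		/* 0, or MAX_ENTRY_NUM/2 for bipolar */
};

static void account(struct hist *h, long latency_us)
{
	long idx = latency_us + h->offset;

	if (idx < 0)
		h->below++;
	else if (idx >= MAX_ENTRY_NUM)
		h->above++;
	else
		h->slot[idx]++;
}

int main(void)
{
	static struct hist bipolar = { .offset = MAX_ENTRY_NUM / 2 };

	account(&bipolar, -3);	/* e.g. a timer that fired 3 us early */
	account(&bipolar, 42);	/* e.g. a timer that fired 42 us late  */
	printf("slot[%d]=%lld slot[%d]=%lld\n",
	       MAX_ENTRY_NUM / 2 - 3, bipolar.slot[MAX_ENTRY_NUM / 2 - 3],
	       MAX_ENTRY_NUM / 2 + 42, bipolar.slot[MAX_ENTRY_NUM / 2 + 42]);
	return 0;
}
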
18734 +static void *l_start(struct seq_file *m, loff_t *pos)
18736 + loff_t *index_ptr = NULL;
18737 + loff_t index = *pos;
18738 + struct hist_data *my_hist = m->private;
18740 + if (index == 0) {
18741 + char minstr[32], avgstr[32], maxstr[32];
18743 + atomic_dec(&my_hist->hist_mode);
18745 + if (likely(my_hist->total_samples)) {
18746 + long avg = (long) div64_s64(my_hist->accumulate_lat,
18747 + my_hist->total_samples);
18748 + snprintf(minstr, sizeof(minstr), "%ld",
18749 + my_hist->min_lat - my_hist->offset);
18750 + snprintf(avgstr, sizeof(avgstr), "%ld",
18751 + avg - my_hist->offset);
18752 + snprintf(maxstr, sizeof(maxstr), "%ld",
18753 + my_hist->max_lat - my_hist->offset);
18755 + strcpy(minstr, "<undef>");
18756 + strcpy(avgstr, minstr);
18757 + strcpy(maxstr, minstr);
18760 + seq_printf(m, "#Minimum latency: %s microseconds\n"
18761 + "#Average latency: %s microseconds\n"
18762 + "#Maximum latency: %s microseconds\n"
18763 + "#Total samples: %llu\n"
18764 + "#There are %llu samples lower than %ld"
18765 + " microseconds.\n"
18766 + "#There are %llu samples greater or equal"
18767 + " than %ld microseconds.\n"
18768 + "#usecs\t%16s\n",
18769 + minstr, avgstr, maxstr,
18770 + my_hist->total_samples,
18771 + my_hist->below_hist_bound_samples,
18772 + -my_hist->offset,
18773 + my_hist->above_hist_bound_samples,
18774 + MAX_ENTRY_NUM - my_hist->offset,
18777 + if (index < MAX_ENTRY_NUM) {
18778 + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
18780 + *index_ptr = index;
18783 + return index_ptr;
18786 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
18788 + loff_t *index_ptr = p;
18789 + struct hist_data *my_hist = m->private;
18791 + if (++*pos >= MAX_ENTRY_NUM) {
18792 + atomic_inc(&my_hist->hist_mode);
18795 + *index_ptr = *pos;
18796 + return index_ptr;
18799 +static void l_stop(struct seq_file *m, void *p)
18804 +static int l_show(struct seq_file *m, void *p)
18806 + int index = *(loff_t *) p;
18807 + struct hist_data *my_hist = m->private;
18809 + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
18810 + my_hist->hist_array[index]);
18814 +static const struct seq_operations latency_hist_seq_op = {
18815 + .start = l_start,
18821 +static int latency_hist_open(struct inode *inode, struct file *file)
18825 + ret = seq_open(file, &latency_hist_seq_op);
18827 + struct seq_file *seq = file->private_data;
18828 + seq->private = inode->i_private;
18833 +static const struct file_operations latency_hist_fops = {
18834 + .open = latency_hist_open,
18835 + .read = seq_read,
18836 + .llseek = seq_lseek,
18837 + .release = seq_release,
18840 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18841 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18842 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
18844 + mp->comm[0] = mp->current_comm[0] = '\0';
18845 + mp->prio = mp->current_prio = mp->pid = mp->current_pid =
18846 + mp->latency = mp->timeroffset = -1;
18847 + mp->timestamp = 0;
18851 +static void hist_reset(struct hist_data *hist)
18853 + atomic_dec(&hist->hist_mode);
18855 + memset(hist->hist_array, 0, sizeof(hist->hist_array));
18856 + hist->below_hist_bound_samples = 0ULL;
18857 + hist->above_hist_bound_samples = 0ULL;
18858 + hist->min_lat = LONG_MAX;
18859 + hist->max_lat = LONG_MIN;
18860 + hist->total_samples = 0ULL;
18861 + hist->accumulate_lat = 0LL;
18863 + atomic_inc(&hist->hist_mode);
18867 +latency_hist_reset(struct file *file, const char __user *a,
18868 + size_t size, loff_t *off)
18871 + struct hist_data *hist = NULL;
18872 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18873 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18874 + struct maxlatproc_data *mp = NULL;
18876 + off_t latency_type = (off_t) file->private_data;
18878 + for_each_online_cpu(cpu) {
18880 + switch (latency_type) {
18881 +#ifdef CONFIG_PREEMPT_OFF_HIST
18882 + case PREEMPTOFF_LATENCY:
18883 + hist = &per_cpu(preemptoff_hist, cpu);
18886 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18887 + case IRQSOFF_LATENCY:
18888 + hist = &per_cpu(irqsoff_hist, cpu);
18891 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
18892 + case PREEMPTIRQSOFF_LATENCY:
18893 + hist = &per_cpu(preemptirqsoff_hist, cpu);
18896 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18897 + case WAKEUP_LATENCY:
18898 + hist = &per_cpu(wakeup_latency_hist, cpu);
18899 + mp = &per_cpu(wakeup_maxlatproc, cpu);
18901 + case WAKEUP_LATENCY_SHAREDPRIO:
18902 + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
18903 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
18906 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18907 + case MISSED_TIMER_OFFSETS:
18908 + hist = &per_cpu(missed_timer_offsets, cpu);
18909 + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
18912 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18913 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18914 + case TIMERANDWAKEUP_LATENCY:
18915 + hist = &per_cpu(timerandwakeup_latency_hist, cpu);
18916 + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
18921 + hist_reset(hist);
18922 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18923 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18924 + if (latency_type == WAKEUP_LATENCY ||
18925 + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
18926 + latency_type == MISSED_TIMER_OFFSETS ||
18927 + latency_type == TIMERANDWAKEUP_LATENCY)
18928 + clear_maxlatprocdata(mp);
18935 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18936 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18938 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
18942 + unsigned long *this_pid = file->private_data;
18944 + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
18945 + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
18948 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
18949 + size_t cnt, loff_t *ppos)
18952 + unsigned long pid;
18953 + unsigned long *this_pid = file->private_data;
18955 + if (cnt >= sizeof(buf))
18958 + if (copy_from_user(&buf, ubuf, cnt))
18963 + if (kstrtoul(buf, 10, &pid))
18972 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18973 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18975 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
18978 + struct maxlatproc_data *mp = file->private_data;
18979 + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
18980 + unsigned long long t;
18981 + unsigned long usecs, secs;
18984 + if (mp->pid == -1 || mp->current_pid == -1) {
18985 + buf = "(none)\n";
18986 + return simple_read_from_buffer(ubuf, cnt, ppos, buf,
18990 + buf = kmalloc(strmaxlen, GFP_KERNEL);
18994 + t = ns2usecs(mp->timestamp);
18995 + usecs = do_div(t, USEC_PER_SEC);
18996 + secs = (unsigned long) t;
18997 + r = snprintf(buf, strmaxlen,
18998 + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
18999 + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
19000 + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
19002 + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19009 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19012 + struct enable_data *ed = file->private_data;
19015 + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
19016 + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19020 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
19024 + struct enable_data *ed = file->private_data;
19026 + if (cnt >= sizeof(buf))
19029 + if (copy_from_user(&buf, ubuf, cnt))
19034 + if (kstrtoul(buf, 10, &enable))
19037 + if ((enable && ed->enabled) || (!enable && !ed->enabled))
19043 + switch (ed->latency_type) {
19044 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19045 + case PREEMPTIRQSOFF_LATENCY:
19046 + ret = register_trace_preemptirqsoff_hist(
19047 + probe_preemptirqsoff_hist, NULL);
19049 + pr_info("wakeup trace: Couldn't assign "
19050 + "probe_preemptirqsoff_hist "
19051 + "to trace_preemptirqsoff_hist\n");
19056 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19057 + case WAKEUP_LATENCY:
19058 + ret = register_trace_sched_wakeup(
19059 + probe_wakeup_latency_hist_start, NULL);
19061 + pr_info("wakeup trace: Couldn't assign "
19062 + "probe_wakeup_latency_hist_start "
19063 + "to trace_sched_wakeup\n");
19066 + ret = register_trace_sched_wakeup_new(
19067 + probe_wakeup_latency_hist_start, NULL);
19069 + pr_info("wakeup trace: Couldn't assign "
19070 + "probe_wakeup_latency_hist_start "
19071 + "to trace_sched_wakeup_new\n");
19072 + unregister_trace_sched_wakeup(
19073 + probe_wakeup_latency_hist_start, NULL);
19076 + ret = register_trace_sched_switch(
19077 + probe_wakeup_latency_hist_stop, NULL);
19079 + pr_info("wakeup trace: Couldn't assign "
19080 + "probe_wakeup_latency_hist_stop "
19081 + "to trace_sched_switch\n");
19082 + unregister_trace_sched_wakeup(
19083 + probe_wakeup_latency_hist_start, NULL);
19084 + unregister_trace_sched_wakeup_new(
19085 + probe_wakeup_latency_hist_start, NULL);
19088 + ret = register_trace_sched_migrate_task(
19089 + probe_sched_migrate_task, NULL);
19091 + pr_info("wakeup trace: Couldn't assign "
19092 + "probe_sched_migrate_task "
19093 + "to trace_sched_migrate_task\n");
19094 + unregister_trace_sched_wakeup(
19095 + probe_wakeup_latency_hist_start, NULL);
19096 + unregister_trace_sched_wakeup_new(
19097 + probe_wakeup_latency_hist_start, NULL);
19098 + unregister_trace_sched_switch(
19099 + probe_wakeup_latency_hist_stop, NULL);
19104 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19105 + case MISSED_TIMER_OFFSETS:
19106 + ret = register_trace_hrtimer_interrupt(
19107 + probe_hrtimer_interrupt, NULL);
19109 + pr_info("wakeup trace: Couldn't assign "
19110 + "probe_hrtimer_interrupt "
19111 + "to trace_hrtimer_interrupt\n");
19116 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19117 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19118 + case TIMERANDWAKEUP_LATENCY:
19119 + if (!wakeup_latency_enabled_data.enabled ||
19120 + !missed_timer_offsets_enabled_data.enabled)
19128 + switch (ed->latency_type) {
19129 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19130 + case PREEMPTIRQSOFF_LATENCY:
19134 + unregister_trace_preemptirqsoff_hist(
19135 + probe_preemptirqsoff_hist, NULL);
19136 + for_each_online_cpu(cpu) {
19137 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19138 + per_cpu(hist_irqsoff_counting,
19141 +#ifdef CONFIG_PREEMPT_OFF_HIST
19142 + per_cpu(hist_preemptoff_counting,
19145 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19146 + per_cpu(hist_preemptirqsoff_counting,
19153 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19154 + case WAKEUP_LATENCY:
19158 + unregister_trace_sched_wakeup(
19159 + probe_wakeup_latency_hist_start, NULL);
19160 + unregister_trace_sched_wakeup_new(
19161 + probe_wakeup_latency_hist_start, NULL);
19162 + unregister_trace_sched_switch(
19163 + probe_wakeup_latency_hist_stop, NULL);
19164 + unregister_trace_sched_migrate_task(
19165 + probe_sched_migrate_task, NULL);
19167 + for_each_online_cpu(cpu) {
19168 + per_cpu(wakeup_task, cpu) = NULL;
19169 + per_cpu(wakeup_sharedprio, cpu) = 0;
19172 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19173 + timerandwakeup_enabled_data.enabled = 0;
19177 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19178 + case MISSED_TIMER_OFFSETS:
19179 + unregister_trace_hrtimer_interrupt(
19180 + probe_hrtimer_interrupt, NULL);
19181 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19182 + timerandwakeup_enabled_data.enabled = 0;
19190 + ed->enabled = enable;
19194 +static const struct file_operations latency_hist_reset_fops = {
19195 + .open = tracing_open_generic,
19196 + .write = latency_hist_reset,
19199 +static const struct file_operations enable_fops = {
19200 + .open = tracing_open_generic,
19201 + .read = show_enable,
19202 + .write = do_enable,
19205 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19206 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19207 +static const struct file_operations pid_fops = {
19208 + .open = tracing_open_generic,
19209 + .read = show_pid,
19213 +static const struct file_operations maxlatproc_fops = {
19214 + .open = tracing_open_generic,
19215 + .read = show_maxlatproc,
19219 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19220 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
19223 + int cpu = raw_smp_processor_id();
19224 + int time_set = 0;
19227 + cycle_t uninitialized_var(start);
19229 + if (!preempt_count() && !irqs_disabled())
19232 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19233 + if ((reason == IRQS_OFF || reason == TRACE_START) &&
19234 + !per_cpu(hist_irqsoff_counting, cpu)) {
19235 + per_cpu(hist_irqsoff_counting, cpu) = 1;
19236 + start = ftrace_now(cpu);
19238 + per_cpu(hist_irqsoff_start, cpu) = start;
19242 +#ifdef CONFIG_PREEMPT_OFF_HIST
19243 + if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
19244 + !per_cpu(hist_preemptoff_counting, cpu)) {
19245 + per_cpu(hist_preemptoff_counting, cpu) = 1;
19246 + if (!(time_set++))
19247 + start = ftrace_now(cpu);
19248 + per_cpu(hist_preemptoff_start, cpu) = start;
19252 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19253 + if (per_cpu(hist_irqsoff_counting, cpu) &&
19254 + per_cpu(hist_preemptoff_counting, cpu) &&
19255 + !per_cpu(hist_preemptirqsoff_counting, cpu)) {
19256 + per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
19258 + start = ftrace_now(cpu);
19259 + per_cpu(hist_preemptirqsoff_start, cpu) = start;
19263 + cycle_t uninitialized_var(stop);
19265 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19266 + if ((reason == IRQS_ON || reason == TRACE_STOP) &&
19267 + per_cpu(hist_irqsoff_counting, cpu)) {
19268 + cycle_t start = per_cpu(hist_irqsoff_start, cpu);
19270 + stop = ftrace_now(cpu);
19273 + long latency = ((long) (stop - start)) /
19276 + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
19279 + per_cpu(hist_irqsoff_counting, cpu) = 0;
19283 +#ifdef CONFIG_PREEMPT_OFF_HIST
19284 + if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
19285 + per_cpu(hist_preemptoff_counting, cpu)) {
19286 + cycle_t start = per_cpu(hist_preemptoff_start, cpu);
19288 + if (!(time_set++))
19289 + stop = ftrace_now(cpu);
19291 + long latency = ((long) (stop - start)) /
19294 + latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
19297 + per_cpu(hist_preemptoff_counting, cpu) = 0;
19301 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19302 + if ((!per_cpu(hist_irqsoff_counting, cpu) ||
19303 + !per_cpu(hist_preemptoff_counting, cpu)) &&
19304 + per_cpu(hist_preemptirqsoff_counting, cpu)) {
19305 + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
19308 + stop = ftrace_now(cpu);
19310 + long latency = ((long) (stop - start)) /
19313 + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
19314 + latency, 0, stop, NULL);
19316 + per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
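
One subtlety in the probe above: a single event can start or stop up to three
histograms (irqs-off, preempt-off and the combined preemptirqsoff one), and
the time_set counter ensures ftrace_now() is read at most once per event, so
all affected histograms share an identical timestamp. The idiom in isolation
(userspace sketch):

#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

void on_event(int want_a, int want_b, uint64_t *stamp_a, uint64_t *stamp_b)
{
	int time_set = 0;
	uint64_t now = 0;

	if (want_a) {
		if (!(time_set++))
			now = now_ns();	/* first consumer reads the clock */
		*stamp_a = now;
	}
	if (want_b) {
		if (!(time_set++))
			now = now_ns();	/* skipped when already taken above */
		*stamp_b = now;
	}
}
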
19323 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19324 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
19325 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
19328 + int old_cpu = task_cpu(task);
19330 + if (cpu != old_cpu) {
19331 + unsigned long flags;
19332 + struct task_struct *cpu_wakeup_task;
19334 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19336 + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
19337 + if (task == cpu_wakeup_task) {
19338 + put_task_struct(cpu_wakeup_task);
19339 + per_cpu(wakeup_task, old_cpu) = NULL;
19340 + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
19341 + get_task_struct(cpu_wakeup_task);
19344 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19348 +static notrace void probe_wakeup_latency_hist_start(void *v,
19349 + struct task_struct *p)
19351 + unsigned long flags;
19352 + struct task_struct *curr = current;
19353 + int cpu = task_cpu(p);
19354 + struct task_struct *cpu_wakeup_task;
19356 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19358 + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19360 + if (wakeup_pid) {
19361 + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19362 + p->prio == curr->prio)
19363 + per_cpu(wakeup_sharedprio, cpu) = 1;
19364 + if (likely(wakeup_pid != task_pid_nr(p)))
19367 + if (likely(!rt_task(p)) ||
19368 + (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
19369 + p->prio > curr->prio)
19371 + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19372 + p->prio == curr->prio)
19373 + per_cpu(wakeup_sharedprio, cpu) = 1;
19376 + if (cpu_wakeup_task)
19377 + put_task_struct(cpu_wakeup_task);
19378 + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
19379 + get_task_struct(cpu_wakeup_task);
19380 + cpu_wakeup_task->preempt_timestamp_hist =
19381 + ftrace_now(raw_smp_processor_id());
19383 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19386 +static notrace void probe_wakeup_latency_hist_stop(void *v,
19387 + bool preempt, struct task_struct *prev, struct task_struct *next)
19389 + unsigned long flags;
19390 + int cpu = task_cpu(next);
19393 + struct task_struct *cpu_wakeup_task;
19395 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19397 + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19399 + if (cpu_wakeup_task == NULL)
19402 + /* Already running? */
19403 + if (unlikely(current == cpu_wakeup_task))
19406 + if (next != cpu_wakeup_task) {
19407 + if (next->prio < cpu_wakeup_task->prio)
19410 + if (next->prio == cpu_wakeup_task->prio)
19411 + per_cpu(wakeup_sharedprio, cpu) = 1;
19416 + if (current->prio == cpu_wakeup_task->prio)
19417 + per_cpu(wakeup_sharedprio, cpu) = 1;
19420 + * The task we are waiting for is about to be switched to.
19421 + * Calculate latency and store it in histogram.
19423 + stop = ftrace_now(raw_smp_processor_id());
19425 + latency = ((long) (stop - next->preempt_timestamp_hist)) /
19428 + if (per_cpu(wakeup_sharedprio, cpu)) {
19429 + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
19431 + per_cpu(wakeup_sharedprio, cpu) = 0;
19433 + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
19434 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19435 + if (timerandwakeup_enabled_data.enabled) {
19436 + latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
19437 + next->timer_offset + latency, next->timer_offset,
19444 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19445 + next->timer_offset = 0;
19447 + put_task_struct(cpu_wakeup_task);
19448 + per_cpu(wakeup_task, cpu) = NULL;
19450 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19454 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19455 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
19456 + long long latency_ns, struct task_struct *curr,
19457 + struct task_struct *task)
19459 + if (latency_ns <= 0 && task != NULL && rt_task(task) &&
19460 + (task->prio < curr->prio ||
19461 + (task->prio == curr->prio &&
19462 + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
19466 + if (missed_timer_offsets_pid) {
19467 + if (likely(missed_timer_offsets_pid !=
19468 + task_pid_nr(task)))
19472 + now = ftrace_now(cpu);
19473 + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
19474 + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
19476 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19477 + task->timer_offset = latency;
19483 +static __init int latency_hist_init(void)
19485 + struct dentry *latency_hist_root = NULL;
19486 + struct dentry *dentry;
19487 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19488 + struct dentry *dentry_sharedprio;
19490 + struct dentry *entry;
19491 + struct dentry *enable_root;
19493 + struct hist_data *my_hist;
19495 + char *cpufmt = "CPU%d";
19496 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19497 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19498 + char *cpufmt_maxlatproc = "max_latency-CPU%d";
19499 + struct maxlatproc_data *mp = NULL;
19502 + dentry = tracing_init_dentry();
19503 + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
19504 + enable_root = debugfs_create_dir("enable", latency_hist_root);
19506 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19507 + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
19508 + for_each_possible_cpu(i) {
19509 + sprintf(name, cpufmt, i);
19510 + entry = debugfs_create_file(name, 0444, dentry,
19511 + &per_cpu(irqsoff_hist, i), &latency_hist_fops);
19512 + my_hist = &per_cpu(irqsoff_hist, i);
19513 + atomic_set(&my_hist->hist_mode, 1);
19514 + my_hist->min_lat = LONG_MAX;
19516 + entry = debugfs_create_file("reset", 0644, dentry,
19517 + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
19520 +#ifdef CONFIG_PREEMPT_OFF_HIST
19521 + dentry = debugfs_create_dir(preemptoff_hist_dir,
19522 + latency_hist_root);
19523 + for_each_possible_cpu(i) {
19524 + sprintf(name, cpufmt, i);
19525 + entry = debugfs_create_file(name, 0444, dentry,
19526 + &per_cpu(preemptoff_hist, i), &latency_hist_fops);
19527 + my_hist = &per_cpu(preemptoff_hist, i);
19528 + atomic_set(&my_hist->hist_mode, 1);
19529 + my_hist->min_lat = LONG_MAX;
19531 + entry = debugfs_create_file("reset", 0644, dentry,
19532 + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
19535 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19536 + dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
19537 + latency_hist_root);
19538 + for_each_possible_cpu(i) {
19539 + sprintf(name, cpufmt, i);
19540 + entry = debugfs_create_file(name, 0444, dentry,
19541 + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
19542 + my_hist = &per_cpu(preemptirqsoff_hist, i);
19543 + atomic_set(&my_hist->hist_mode, 1);
19544 + my_hist->min_lat = LONG_MAX;
19546 + entry = debugfs_create_file("reset", 0644, dentry,
19547 + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
19550 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19551 + entry = debugfs_create_file("preemptirqsoff", 0644,
19552 + enable_root, (void *)&preemptirqsoff_enabled_data,
19556 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19557 + dentry = debugfs_create_dir(wakeup_latency_hist_dir,
19558 + latency_hist_root);
19559 + dentry_sharedprio = debugfs_create_dir(
19560 + wakeup_latency_hist_dir_sharedprio, dentry);
19561 + for_each_possible_cpu(i) {
19562 + sprintf(name, cpufmt, i);
19564 + entry = debugfs_create_file(name, 0444, dentry,
19565 + &per_cpu(wakeup_latency_hist, i),
19566 + &latency_hist_fops);
19567 + my_hist = &per_cpu(wakeup_latency_hist, i);
19568 + atomic_set(&my_hist->hist_mode, 1);
19569 + my_hist->min_lat = LONG_MAX;
19571 + entry = debugfs_create_file(name, 0444, dentry_sharedprio,
19572 + &per_cpu(wakeup_latency_hist_sharedprio, i),
19573 + &latency_hist_fops);
19574 + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
19575 + atomic_set(&my_hist->hist_mode, 1);
19576 + my_hist->min_lat = LONG_MAX;
19578 + sprintf(name, cpufmt_maxlatproc, i);
19580 + mp = &per_cpu(wakeup_maxlatproc, i);
19581 + entry = debugfs_create_file(name, 0444, dentry, mp,
19582 + &maxlatproc_fops);
19583 + clear_maxlatprocdata(mp);
19585 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
19586 + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
19587 + &maxlatproc_fops);
19588 + clear_maxlatprocdata(mp);
19590 + entry = debugfs_create_file("pid", 0644, dentry,
19591 + (void *)&wakeup_pid, &pid_fops);
19592 + entry = debugfs_create_file("reset", 0644, dentry,
19593 + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
19594 + entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
19595 + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
19596 + entry = debugfs_create_file("wakeup", 0644,
19597 + enable_root, (void *)&wakeup_latency_enabled_data,
19601 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19602 + dentry = debugfs_create_dir(missed_timer_offsets_dir,
19603 + latency_hist_root);
19604 + for_each_possible_cpu(i) {
19605 + sprintf(name, cpufmt, i);
19606 + entry = debugfs_create_file(name, 0444, dentry,
19607 + &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
19608 + my_hist = &per_cpu(missed_timer_offsets, i);
19609 + atomic_set(&my_hist->hist_mode, 1);
19610 + my_hist->min_lat = LONG_MAX;
19612 + sprintf(name, cpufmt_maxlatproc, i);
19613 + mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
19614 + entry = debugfs_create_file(name, 0444, dentry, mp,
19615 + &maxlatproc_fops);
19616 + clear_maxlatprocdata(mp);
19618 + entry = debugfs_create_file("pid", 0644, dentry,
19619 + (void *)&missed_timer_offsets_pid, &pid_fops);
19620 + entry = debugfs_create_file("reset", 0644, dentry,
19621 + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
19622 + entry = debugfs_create_file("missed_timer_offsets", 0644,
19623 + enable_root, (void *)&missed_timer_offsets_enabled_data,
19627 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19628 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19629 + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
19630 + latency_hist_root);
19631 + for_each_possible_cpu(i) {
19632 + sprintf(name, cpufmt, i);
19633 + entry = debugfs_create_file(name, 0444, dentry,
19634 + &per_cpu(timerandwakeup_latency_hist, i),
19635 + &latency_hist_fops);
19636 + my_hist = &per_cpu(timerandwakeup_latency_hist, i);
19637 + atomic_set(&my_hist->hist_mode, 1);
19638 + my_hist->min_lat = LONG_MAX;
19640 + sprintf(name, cpufmt_maxlatproc, i);
19641 + mp = &per_cpu(timerandwakeup_maxlatproc, i);
19642 + entry = debugfs_create_file(name, 0444, dentry, mp,
19643 + &maxlatproc_fops);
19644 + clear_maxlatprocdata(mp);
19646 + entry = debugfs_create_file("reset", 0644, dentry,
19647 + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
19648 + entry = debugfs_create_file("timerandwakeup", 0644,
19649 + enable_root, (void *)&timerandwakeup_enabled_data,
19655 +device_initcall(latency_hist_init);
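
Besides the per-CPU histogram files, latency_hist_init() creates a writable
"reset" file in each directory (any write clears the per-CPU data through
hist_reset()) and, for the wakeup and missed-timer histograms, a "pid" file
that restricts accounting to a single task via do_pid(). Typical use from
userspace (sketch, same debugfs root as before):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define WAKEUP "/sys/kernel/debug/tracing/latency_hist/wakeup/"

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, s, strlen(s)) < 0)
		perror(path);
	close(fd);
	return 0;
}

int main(void)
{
	char pid[16];

	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	write_str(WAKEUP "reset", "1");	/* zero all per-CPU histograms */
	write_str(WAKEUP "pid", pid);	/* account only this task's wakeups */
	return 0;
}
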
19656 diff -Nur linux-4.9.6.orig/kernel/trace/Makefile linux-4.9.6/kernel/trace/Makefile
19657 --- linux-4.9.6.orig/kernel/trace/Makefile 2017-01-26 08:25:24.000000000 +0100
19658 +++ linux-4.9.6/kernel/trace/Makefile 2017-01-28 13:59:10.091662336 +0100
19660 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
19661 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
19662 obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
19663 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
19664 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
19665 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
19666 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
19667 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
19668 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
19669 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
19670 diff -Nur linux-4.9.6.orig/kernel/trace/trace.c linux-4.9.6/kernel/trace/trace.c
19671 --- linux-4.9.6.orig/kernel/trace/trace.c 2017-01-26 08:25:24.000000000 +0100
19672 +++ linux-4.9.6/kernel/trace/trace.c 2017-01-28 13:59:10.091662336 +0100
19673 @@ -1897,6 +1897,7 @@
19674 struct task_struct *tsk = current;
19676 entry->preempt_count = pc & 0xff;
19677 + entry->preempt_lazy_count = preempt_lazy_count();
19678 entry->pid = (tsk) ? tsk->pid : 0;
19680 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
19681 @@ -1907,8 +1908,11 @@
19682 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
19683 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
19684 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
19685 - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
19686 + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
19687 + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
19688 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
19690 + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
19692 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
19694 @@ -2892,14 +2896,17 @@
19696 static void print_lat_help_header(struct seq_file *m)
19698 - seq_puts(m, "# _------=> CPU# \n"
19699 - "# / _-----=> irqs-off \n"
19700 - "# | / _----=> need-resched \n"
19701 - "# || / _---=> hardirq/softirq \n"
19702 - "# ||| / _--=> preempt-depth \n"
19703 - "# |||| / delay \n"
19704 - "# cmd pid ||||| time | caller \n"
19705 - "# \\ / ||||| \\ | / \n");
19706 + seq_puts(m, "# _--------=> CPU# \n"
19707 + "# / _-------=> irqs-off \n"
19708 + "# | / _------=> need-resched \n"
19709 + "# || / _-----=> need-resched_lazy \n"
19710 + "# ||| / _----=> hardirq/softirq \n"
19711 + "# |||| / _---=> preempt-depth \n"
19712 + "# ||||| / _--=> preempt-lazy-depth\n"
19713 + "# |||||| / _-=> migrate-disable \n"
19714 + "# ||||||| / delay \n"
19715 + "# cmd pid |||||||| time | caller \n"
19716 + "# \\ / |||||||| \\ | / \n");
19719 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
19720 @@ -2925,11 +2932,14 @@
19721 print_event_info(buf, m);
19722 seq_puts(m, "# _-----=> irqs-off\n"
19723 "# / _----=> need-resched\n"
19724 - "# | / _---=> hardirq/softirq\n"
19725 - "# || / _--=> preempt-depth\n"
19726 - "# ||| / delay\n"
19727 - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
19728 - "# | | | |||| | |\n");
19729 + "# |/ _-----=> need-resched_lazy\n"
19730 + "# || / _---=> hardirq/softirq\n"
19731 + "# ||| / _--=> preempt-depth\n"
19732 + "# |||| / _-=> preempt-lazy-depth\n"
19733 + "# ||||| / _-=> migrate-disable \n"
19734 + "# |||||| / delay\n"
19735 + "# TASK-PID CPU# ||||||| TIMESTAMP FUNCTION\n"
19736 + "# | | | ||||||| | |\n");
19740 diff -Nur linux-4.9.6.orig/kernel/trace/trace_events.c linux-4.9.6/kernel/trace/trace_events.c
19741 --- linux-4.9.6.orig/kernel/trace/trace_events.c 2017-01-26 08:25:24.000000000 +0100
19742 +++ linux-4.9.6/kernel/trace/trace_events.c 2017-01-28 13:59:10.091662336 +0100
19743 @@ -187,6 +187,8 @@
19744 __common_field(unsigned char, flags);
19745 __common_field(unsigned char, preempt_count);
19746 __common_field(int, pid);
19747 + __common_field(unsigned short, migrate_disable);
19748 + __common_field(unsigned short, padding);
19752 diff -Nur linux-4.9.6.orig/kernel/trace/trace.h linux-4.9.6/kernel/trace/trace.h
19753 --- linux-4.9.6.orig/kernel/trace/trace.h 2017-01-26 08:25:24.000000000 +0100
19754 +++ linux-4.9.6/kernel/trace/trace.h 2017-01-28 13:59:10.091662336 +0100
19755 @@ -124,6 +124,7 @@
19756 * NEED_RESCHED - reschedule is requested
19757 * HARDIRQ - inside an interrupt handler
19758 * SOFTIRQ - inside a softirq handler
19759 + * NEED_RESCHED_LAZY - lazy reschedule is requested
19761 enum trace_flag_type {
19762 TRACE_FLAG_IRQS_OFF = 0x01,
19763 @@ -133,6 +134,7 @@
19764 TRACE_FLAG_SOFTIRQ = 0x10,
19765 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
19766 TRACE_FLAG_NMI = 0x40,
19767 + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
19770 #define TRACE_BUF_SIZE 1024
19771 diff -Nur linux-4.9.6.orig/kernel/trace/trace_irqsoff.c linux-4.9.6/kernel/trace/trace_irqsoff.c
19772 --- linux-4.9.6.orig/kernel/trace/trace_irqsoff.c 2017-01-26 08:25:24.000000000 +0100
19773 +++ linux-4.9.6/kernel/trace/trace_irqsoff.c 2017-01-28 13:59:10.091662336 +0100
19775 #include <linux/uaccess.h>
19776 #include <linux/module.h>
19777 #include <linux/ftrace.h>
19778 +#include <trace/events/hist.h>
19782 @@ -424,11 +425,13 @@
19784 if (preempt_trace() || irq_trace())
19785 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19786 + trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
19788 EXPORT_SYMBOL_GPL(start_critical_timings);
19790 void stop_critical_timings(void)
19792 + trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
19793 if (preempt_trace() || irq_trace())
19794 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19796 @@ -438,6 +441,7 @@
19797 #ifdef CONFIG_PROVE_LOCKING
19798 void time_hardirqs_on(unsigned long a0, unsigned long a1)
19800 + trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
19801 if (!preempt_trace() && irq_trace())
19802 stop_critical_timing(a0, a1);
19804 @@ -446,6 +450,7 @@
19806 if (!preempt_trace() && irq_trace())
19807 start_critical_timing(a0, a1);
19808 + trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
19811 #else /* !CONFIG_PROVE_LOCKING */
19812 @@ -471,6 +476,7 @@
19814 void trace_hardirqs_on(void)
19816 + trace_preemptirqsoff_hist(IRQS_ON, 0);
19817 if (!preempt_trace() && irq_trace())
19818 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19820 @@ -480,11 +486,13 @@
19822 if (!preempt_trace() && irq_trace())
19823 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19824 + trace_preemptirqsoff_hist(IRQS_OFF, 1);
19826 EXPORT_SYMBOL(trace_hardirqs_off);
19828 __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
19830 + trace_preemptirqsoff_hist(IRQS_ON, 0);
19831 if (!preempt_trace() && irq_trace())
19832 stop_critical_timing(CALLER_ADDR0, caller_addr);
19834 @@ -494,6 +502,7 @@
19836 if (!preempt_trace() && irq_trace())
19837 start_critical_timing(CALLER_ADDR0, caller_addr);
19838 + trace_preemptirqsoff_hist(IRQS_OFF, 1);
19840 EXPORT_SYMBOL(trace_hardirqs_off_caller);
19842 @@ -503,12 +512,14 @@
19843 #ifdef CONFIG_PREEMPT_TRACER
19844 void trace_preempt_on(unsigned long a0, unsigned long a1)
19846 + trace_preemptirqsoff_hist(PREEMPT_ON, 0);
19847 if (preempt_trace() && !irq_trace())
19848 stop_critical_timing(a0, a1);
19851 void trace_preempt_off(unsigned long a0, unsigned long a1)
19853 + trace_preemptirqsoff_hist(PREEMPT_ON, 1);
19854 if (preempt_trace() && !irq_trace())
19855 start_critical_timing(a0, a1);
19857 diff -Nur linux-4.9.6.orig/kernel/trace/trace_output.c linux-4.9.6/kernel/trace/trace_output.c
19858 --- linux-4.9.6.orig/kernel/trace/trace_output.c 2017-01-26 08:25:24.000000000 +0100
19859 +++ linux-4.9.6/kernel/trace/trace_output.c 2017-01-28 13:59:10.091662336 +0100
19860 @@ -386,6 +386,7 @@
19864 + char need_resched_lazy;
19868 @@ -416,6 +417,9 @@
19872 + need_resched_lazy =
19873 + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
19876 (nmi && hardirq) ? 'Z' :
19878 @@ -424,14 +428,25 @@
19882 - trace_seq_printf(s, "%c%c%c",
19883 - irqs_off, need_resched, hardsoft_irq);
19884 + trace_seq_printf(s, "%c%c%c%c",
19885 + irqs_off, need_resched, need_resched_lazy,
19886 + hardsoft_irq);
19888 if (entry->preempt_count)
19889 trace_seq_printf(s, "%x", entry->preempt_count);
19890 else
19891 trace_seq_putc(s, '.');
19893 + if (entry->preempt_lazy_count)
19894 + trace_seq_printf(s, "%x", entry->preempt_lazy_count);
19895 + else
19896 + trace_seq_putc(s, '.');
19898 + if (entry->migrate_disable)
19899 + trace_seq_printf(s, "%x", entry->migrate_disable);
19900 + else
19901 + trace_seq_putc(s, '.');
19903 return !trace_seq_has_overflowed(s);
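Together with the widened header in trace.c above, the latency format now prints four flag letters followed by hex preempt, preempt-lazy and migrate-disable depths, each falling back to '.' when zero. A hedged userspace sketch of that decoding convention (the 0x80 value mirrors TRACE_FLAG_NEED_RESCHED_LAZY from trace.h; the rest is illustrative, not kernel code):

#include <stdio.h>

#define FLAG_NEED_RESCHED_LAZY 0x80	/* TRACE_FLAG_NEED_RESCHED_LAZY */

static void print_flag_columns(unsigned int flags, int preempt,
			       int preempt_lazy, int migrate_disable)
{
	putchar((flags & FLAG_NEED_RESCHED_LAZY) ? 'L' : '.');
	if (preempt)
		printf("%x", preempt);
	else
		putchar('.');
	if (preempt_lazy)
		printf("%x", preempt_lazy);
	else
		putchar('.');
	if (migrate_disable)
		printf("%x", migrate_disable);
	else
		putchar('.');
	putchar('\n');
}

int main(void)
{
	print_flag_columns(0x80, 1, 2, 0);	/* prints "L12." */
	return 0;
}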
19906 diff -Nur linux-4.9.6.orig/kernel/user.c linux-4.9.6/kernel/user.c
19907 --- linux-4.9.6.orig/kernel/user.c 2017-01-26 08:25:24.000000000 +0100
19908 +++ linux-4.9.6/kernel/user.c 2017-01-28 13:59:10.091662336 +0100
19909 @@ -161,11 +161,11 @@
19913 - local_irq_save(flags);
19914 + local_irq_save_nort(flags);
19915 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
19916 free_user(up, flags);
19918 - local_irq_restore(flags);
19919 + local_irq_restore_nort(flags);
19922 struct user_struct *alloc_uid(kuid_t uid)
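free_uid() can run with interrupts already off, and on RT uidhash_lock is a sleeping spinlock that must not be taken inside a hard irq-off region, so the plain irq-save pair becomes the _nort variants, which only disable interrupts on non-RT kernels. A runnable userspace illustration of the pattern, with interrupt state modeled as a flag (the macro bodies are a sketch in the spirit of the patch's helpers, not verbatim kernel code):

#include <stdio.h>

static int irqs_off;

#ifdef PREEMPT_RT_FULL
/* RT: only snapshot the state; the sleeping lock provides exclusion */
# define local_irq_save_nort(f)    do { (f) = irqs_off; } while (0)
# define local_irq_restore_nort(f) do { (void)(f); } while (0)
#else
/* non-RT: behaves like the real local_irq_save()/restore() */
# define local_irq_save_nort(f)    do { (f) = irqs_off; irqs_off = 1; } while (0)
# define local_irq_restore_nort(f) do { irqs_off = (f); } while (0)
#endif

int main(void)
{
	int flags;

	local_irq_save_nort(flags);
	printf("irqs_off=%d\n", irqs_off);	/* 0 on RT, 1 otherwise */
	local_irq_restore_nort(flags);
	return 0;
}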
19923 diff -Nur linux-4.9.6.orig/kernel/watchdog.c linux-4.9.6/kernel/watchdog.c
19924 --- linux-4.9.6.orig/kernel/watchdog.c 2017-01-26 08:25:24.000000000 +0100
19925 +++ linux-4.9.6/kernel/watchdog.c 2017-01-28 13:59:10.091662336 +0100
19926 @@ -315,6 +315,8 @@
19928 #ifdef CONFIG_HARDLOCKUP_DETECTOR
19930 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
19932 static struct perf_event_attr wd_hw_attr = {
19933 .type = PERF_TYPE_HARDWARE,
19934 .config = PERF_COUNT_HW_CPU_CYCLES,
19935 @@ -348,6 +350,13 @@
19936 /* only print hardlockups once */
19937 if (__this_cpu_read(hard_watchdog_warn) == true)
19938 return;
19939 + /*
19940 + * If early-printk is enabled then make sure we do not
19941 + * lock up in printk() and kill console logging:
19942 + */
19943 + printk_kill();
19944 +
19945 + raw_spin_lock(&watchdog_output_lock);
19947 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
19949 @@ -365,6 +374,7 @@
19950 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
19951 trigger_allbutself_cpu_backtrace();
19953 + raw_spin_unlock(&watchdog_output_lock);
19954 if (hardlockup_panic)
19955 nmi_panic(regs, "Hard LOCKUP");
19957 @@ -512,6 +522,7 @@
19958 /* kick off the timer for the hardlockup detector */
19959 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
19960 hrtimer->function = watchdog_timer_fn;
19961 + hrtimer->irqsafe = 1;
19963 /* Enable the perf event */
19964 watchdog_nmi_enable(cpu);
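Several CPUs can detect a hard lockup at the same time, and their multi-line reports would interleave in the log; the new raw spinlock serializes them. It has to be a raw lock because an ordinary spinlock sleeps on RT, which is impossible in this NMI-driven path. A userspace analogue of the same serialization (a sketch, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t output_lock;

static void *report(void *arg)
{
	/* without the lock the two line pairs could interleave */
	pthread_spin_lock(&output_lock);
	printf("cpu %ld: lockup detected\n", (long)arg);
	printf("cpu %ld: backtrace follows\n", (long)arg);
	pthread_spin_unlock(&output_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	pthread_spin_init(&output_lock, PTHREAD_PROCESS_PRIVATE);
	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, report, (void *)i);
	for (long i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}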
19965 diff -Nur linux-4.9.6.orig/kernel/workqueue.c linux-4.9.6/kernel/workqueue.c
19966 --- linux-4.9.6.orig/kernel/workqueue.c 2017-01-26 08:25:24.000000000 +0100
19967 +++ linux-4.9.6/kernel/workqueue.c 2017-01-28 13:59:10.095662491 +0100
19969 #include <linux/nodemask.h>
19970 #include <linux/moduleparam.h>
19971 #include <linux/uaccess.h>
19972 +#include <linux/locallock.h>
19973 +#include <linux/delay.h>
19975 #include "workqueue_internal.h"
19977 @@ -121,11 +123,16 @@
19978 * cpu or grabbing pool->lock is enough for read access. If
19979 * POOL_DISASSOCIATED is set, it's identical to L.
19981 + * On RT we need the extra protection via rt_lock_idle_list() for
19982 + * the list manipulations against read access from
19983 + * wq_worker_sleeping(). All other places are nicely serialized via
19984 + * pool->lock.
19985 + *
19986 * A: pool->attach_mutex protected.
19988 * PL: wq_pool_mutex protected.
19990 - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
19991 + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
19993 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
19995 @@ -134,7 +141,7 @@
19997 * WQ: wq->mutex protected.
19999 - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
20000 + * WR: wq->mutex protected for writes. RCU protected for reads.
20002 * MD: wq_mayday_lock protected.
20004 @@ -185,7 +192,7 @@
20005 atomic_t nr_running ____cacheline_aligned_in_smp;
20008 - * Destruction of pool is sched-RCU protected to allow dereferences
20009 + * Destruction of pool is RCU protected to allow dereferences
20010 * from get_work_pool().
20012 struct rcu_head rcu;
20013 @@ -214,7 +221,7 @@
20015 * Release of unbound pwq is punted to system_wq. See put_pwq()
20016 * and pwq_unbound_release_workfn() for details. pool_workqueue
20017 - * itself is also sched-RCU protected so that the first pwq can be
20018 + * itself is also RCU protected so that the first pwq can be
20019 * determined without grabbing wq->mutex.
20021 struct work_struct unbound_release_work;
20022 @@ -348,6 +355,8 @@
20023 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
20024 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
20026 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
20028 static int worker_thread(void *__worker);
20029 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20031 @@ -355,20 +364,20 @@
20032 #include <trace/events/workqueue.h>
20034 #define assert_rcu_or_pool_mutex() \
20035 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20036 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20037 !lockdep_is_held(&wq_pool_mutex), \
20038 - "sched RCU or wq_pool_mutex should be held")
20039 + "RCU or wq_pool_mutex should be held")
20041 #define assert_rcu_or_wq_mutex(wq) \
20042 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20043 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20044 !lockdep_is_held(&wq->mutex), \
20045 - "sched RCU or wq->mutex should be held")
20046 + "RCU or wq->mutex should be held")
20048 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
20049 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20050 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20051 !lockdep_is_held(&wq->mutex) && \
20052 !lockdep_is_held(&wq_pool_mutex), \
20053 - "sched RCU, wq->mutex or wq_pool_mutex should be held")
20054 + "RCU, wq->mutex or wq_pool_mutex should be held")
20056 #define for_each_cpu_worker_pool(pool, cpu) \
20057 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
20058 @@ -380,7 +389,7 @@
20059 * @pool: iteration cursor
20060 * @pi: integer used for iteration
20062 - * This must be called either with wq_pool_mutex held or sched RCU read
20063 + * This must be called either with wq_pool_mutex held or RCU read
20064 * locked. If the pool needs to be used beyond the locking in effect, the
20065 * caller is responsible for guaranteeing that the pool stays online.
20067 @@ -412,7 +421,7 @@
20068 * @pwq: iteration cursor
20069 * @wq: the target workqueue
20071 - * This must be called either with wq->mutex held or sched RCU read locked.
20072 + * This must be called either with wq->mutex held or RCU read locked.
20073 * If the pwq needs to be used beyond the locking in effect, the caller is
20074 * responsible for guaranteeing that the pwq stays online.
20076 @@ -424,6 +433,31 @@
20077 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
20080 +#ifdef CONFIG_PREEMPT_RT_BASE
20081 +static inline void rt_lock_idle_list(struct worker_pool *pool)
20082 +{
20083 + preempt_disable();
20084 +}
20085 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
20086 +{
20087 + preempt_enable();
20088 +}
20089 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
20090 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
20091 +#else
20092 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
20093 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
20094 +static inline void sched_lock_idle_list(struct worker_pool *pool)
20095 +{
20096 + spin_lock_irq(&pool->lock);
20097 +}
20098 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
20099 +{
20100 + spin_unlock_irq(&pool->lock);
20101 +}
20102 +#endif
20105 #ifdef CONFIG_DEBUG_OBJECTS_WORK
20107 static struct debug_obj_descr work_debug_descr;
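The helpers above encode two alternative protections for pool->idle_list: on RT, writers bracket the list updates with rt_lock_idle_list(), i.e. preemption disabling, so the reader in wq_worker_sleeping() sees a stable list, while non-RT keeps relying on pool->lock through sched_lock_idle_list(). The compile-time shape of that arrangement, sketched for userspace (names and output are illustrative):

#include <stdio.h>

#ifdef RT
static void rt_lock(void)      { printf("preempt_disable()\n"); }
static void rt_unlock(void)    { printf("preempt_enable()\n"); }
static void sched_lock(void)   { }	/* covered by rt_lock() users */
static void sched_unlock(void) { }
#else
static void rt_lock(void)      { }	/* pool->lock already held */
static void rt_unlock(void)    { }
static void sched_lock(void)   { printf("spin_lock_irq(&pool->lock)\n"); }
static void sched_unlock(void) { printf("spin_unlock_irq(&pool->lock)\n"); }
#endif

int main(void)
{
	rt_lock();		/* writer: idle_list add/del */
	rt_unlock();
	sched_lock();		/* wakeup path in wq_worker_sleeping() */
	sched_unlock();
	return 0;
}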
20108 @@ -548,7 +582,7 @@
20109 * @wq: the target workqueue
20110 * @node: the node ID
20112 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
20113 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
20115 * If the pwq needs to be used beyond the locking in effect, the caller is
20116 * responsible for guaranteeing that the pwq stays online.
20117 @@ -692,8 +726,8 @@
20118 * @work: the work item of interest
20120 * Pools are created and destroyed under wq_pool_mutex, and allows read
20121 - * access under sched-RCU read lock. As such, this function should be
20122 - * called under wq_pool_mutex or with preemption disabled.
20123 + * access under RCU read lock. As such, this function should be
20124 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
20126 * All fields of the returned pool are accessible as long as the above
20127 * mentioned locking is in effect. If the returned pool needs to be used
20128 @@ -830,50 +864,45 @@
20130 static void wake_up_worker(struct worker_pool *pool)
20132 - struct worker *worker = first_idle_worker(pool);
20133 + struct worker *worker;
20135 + rt_lock_idle_list(pool);
20137 + worker = first_idle_worker(pool);
20139 if (likely(worker))
20140 wake_up_process(worker->task);
20142 + rt_unlock_idle_list(pool);
20146 - * wq_worker_waking_up - a worker is waking up
20147 + * wq_worker_running - a worker is running again
20148 * @task: task waking up
20149 - * @cpu: CPU @task is waking up to
20151 - * This function is called during try_to_wake_up() when a worker is
20152 - * being awoken.
20153 - *
20154 - * CONTEXT:
20155 - * spin_lock_irq(rq->lock)
20156 + * This function is called when a worker returns from schedule()
20158 -void wq_worker_waking_up(struct task_struct *task, int cpu)
20159 +void wq_worker_running(struct task_struct *task)
20161 struct worker *worker = kthread_data(task);
20163 - if (!(worker->flags & WORKER_NOT_RUNNING)) {
20164 - WARN_ON_ONCE(worker->pool->cpu != cpu);
20165 + if (!worker->sleeping)
20166 + return;
20167 + if (!(worker->flags & WORKER_NOT_RUNNING))
20168 atomic_inc(&worker->pool->nr_running);
20169 - }
20170 + worker->sleeping = 0;
20174 * wq_worker_sleeping - a worker is going to sleep
20175 * @task: task going to sleep
20177 - * This function is called during schedule() when a busy worker is
20178 - * going to sleep. Worker on the same cpu can be woken up by
20179 - * returning pointer to its task.
20181 - * CONTEXT:
20182 - * spin_lock_irq(rq->lock)
20184 - * RETURNS:
20185 - * Worker task on @cpu to wake up, %NULL if none.
20186 + * This function is called from schedule() when a busy worker is
20187 + * going to sleep.
20189 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
20190 +void wq_worker_sleeping(struct task_struct *task)
20192 - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
20193 + struct worker *worker = kthread_data(task);
20194 struct worker_pool *pool;
20197 @@ -882,29 +911,26 @@
20198 * checking NOT_RUNNING.
20200 if (worker->flags & WORKER_NOT_RUNNING)
20201 - return NULL;
20202 + return;
20204 pool = worker->pool;
20206 - /* this can only happen on the local cpu */
20207 - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
20208 - return NULL;
20209 + if (WARN_ON_ONCE(worker->sleeping))
20210 + return;
20211 +
20212 + worker->sleeping = 1;
20215 * The counterpart of the following dec_and_test, implied mb,
20216 * worklist not empty test sequence is in insert_work().
20217 * Please read comment there.
20219 - * NOT_RUNNING is clear. This means that we're bound to and
20220 - * running on the local cpu w/ rq lock held and preemption
20221 - * disabled, which in turn means that none else could be
20222 - * manipulating idle_list, so dereferencing idle_list without pool
20223 - * lock is safe.
20225 if (atomic_dec_and_test(&pool->nr_running) &&
20226 - !list_empty(&pool->worklist))
20227 - to_wakeup = first_idle_worker(pool);
20228 - return to_wakeup ? to_wakeup->task : NULL;
20229 + !list_empty(&pool->worklist)) {
20230 + sched_lock_idle_list(pool);
20231 + wake_up_worker(pool);
20232 + sched_unlock_idle_list(pool);
20233 + }
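The old hook handed the task to wake back to the scheduler, which performed the wakeup while still holding rq->lock; on RT that nesting is not tolerable, so wq_worker_sleeping() now issues the wakeup itself and the new worker->sleeping flag makes the nr_running accounting pair up exactly once per sleep. A heavily simplified sketch of the flag discipline (an illustration, not the kernel's locking):

#include <stdatomic.h>
#include <stdio.h>

struct worker {
	int sleeping;
	atomic_int *nr_running;
};

static void worker_sleeping(struct worker *w)
{
	if (w->sleeping)
		return;			/* already accounted */
	w->sleeping = 1;
	atomic_fetch_sub(w->nr_running, 1);
}

static void worker_running(struct worker *w)
{
	if (!w->sleeping)
		return;			/* never went to sleep */
	atomic_fetch_add(w->nr_running, 1);
	w->sleeping = 0;
}

int main(void)
{
	atomic_int nr_running = 1;
	struct worker w = { 0, &nr_running };

	worker_sleeping(&w);
	worker_sleeping(&w);		/* second call is a no-op */
	worker_running(&w);
	printf("nr_running=%d\n", atomic_load(&nr_running));	/* 1 */
	return 0;
}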
20237 @@ -1098,12 +1124,14 @@
20241 - * As both pwqs and pools are sched-RCU protected, the
20242 + * As both pwqs and pools are RCU protected, the
20243 * following lock operations are safe.
20245 - spin_lock_irq(&pwq->pool->lock);
20246 + rcu_read_lock();
20247 + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
20249 - spin_unlock_irq(&pwq->pool->lock);
20250 + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
20251 + rcu_read_unlock();
20255 @@ -1207,7 +1235,7 @@
20256 struct worker_pool *pool;
20257 struct pool_workqueue *pwq;
20259 - local_irq_save(*flags);
20260 + local_lock_irqsave(pendingb_lock, *flags);
20262 /* try to steal the timer if it exists */
20264 @@ -1226,6 +1254,7 @@
20265 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
20266 return 0;
20268 + rcu_read_lock();
20270 * The queueing is in progress, or it is already queued. Try to
20271 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
20272 @@ -1264,14 +1293,16 @@
20273 set_work_pool_and_keep_pending(work, pool->id);
20275 spin_unlock(&pool->lock);
20276 + rcu_read_unlock();
20279 spin_unlock(&pool->lock);
20281 - local_irq_restore(*flags);
20282 + rcu_read_unlock();
20283 + local_unlock_irqrestore(pendingb_lock, *flags);
20284 if (work_is_canceling(work))
20291 @@ -1373,7 +1404,7 @@
20292 * queued or lose PENDING. Grabbing PENDING and queueing should
20293 * happen with IRQ disabled.
20295 - WARN_ON_ONCE(!irqs_disabled());
20296 + WARN_ON_ONCE_NONRT(!irqs_disabled());
20298 debug_work_activate(work);
20300 @@ -1381,6 +1412,7 @@
20301 if (unlikely(wq->flags & __WQ_DRAINING) &&
20302 WARN_ON_ONCE(!is_chained_work(wq)))
20303 return;
20304 + rcu_read_lock();
20305 retry:
20306 if (req_cpu == WORK_CPU_UNBOUND)
20307 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
20308 @@ -1437,10 +1469,8 @@
20309 /* pwq determined, queue */
20310 trace_workqueue_queue_work(req_cpu, pwq, work);
20312 - if (WARN_ON(!list_empty(&work->entry))) {
20313 - spin_unlock(&pwq->pool->lock);
20316 + if (WARN_ON(!list_empty(&work->entry)))
20319 pwq->nr_in_flight[pwq->work_color]++;
20320 work_flags = work_color_to_flags(pwq->work_color);
20321 @@ -1458,7 +1488,9 @@
20323 insert_work(pwq, work, worklist, work_flags);
20326 spin_unlock(&pwq->pool->lock);
20327 + rcu_read_unlock();
20331 @@ -1478,14 +1510,14 @@
20333 unsigned long flags;
20335 - local_irq_save(flags);
20336 + local_lock_irqsave(pendingb_lock,flags);
20338 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20339 __queue_work(cpu, wq, work);
20343 - local_irq_restore(flags);
20344 + local_unlock_irqrestore(pendingb_lock, flags);
20347 EXPORT_SYMBOL(queue_work_on);
20348 @@ -1552,14 +1584,14 @@
20349 unsigned long flags;
20351 /* read the comment in __queue_work() */
20352 - local_irq_save(flags);
20353 + local_lock_irqsave(pendingb_lock, flags);
20355 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20356 __queue_delayed_work(cpu, wq, dwork, delay);
20360 - local_irq_restore(flags);
20361 + local_unlock_irqrestore(pendingb_lock, flags);
20364 EXPORT_SYMBOL(queue_delayed_work_on);
20365 @@ -1594,7 +1626,7 @@
20367 if (likely(ret >= 0)) {
20368 __queue_delayed_work(cpu, wq, dwork, delay);
20369 - local_irq_restore(flags);
20370 + local_unlock_irqrestore(pendingb_lock, flags);
20373 /* -ENOENT from try_to_grab_pending() becomes %true */
20374 @@ -1627,7 +1659,9 @@
20375 worker->last_active = jiffies;
20377 /* idle_list is LIFO */
20378 + rt_lock_idle_list(pool);
20379 list_add(&worker->entry, &pool->idle_list);
20380 + rt_unlock_idle_list(pool);
20382 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
20383 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
20384 @@ -1660,7 +1694,9 @@
20386 worker_clr_flags(worker, WORKER_IDLE);
20388 + rt_lock_idle_list(pool);
20389 list_del_init(&worker->entry);
20390 + rt_unlock_idle_list(pool);
20393 static struct worker *alloc_worker(int node)
20394 @@ -1826,7 +1862,9 @@
20395 pool->nr_workers--;
20398 + rt_lock_idle_list(pool);
20399 list_del_init(&worker->entry);
20400 + rt_unlock_idle_list(pool);
20401 worker->flags |= WORKER_DIE;
20402 wake_up_process(worker->task);
20404 @@ -2785,14 +2823,14 @@
20408 - local_irq_disable();
20409 + rcu_read_lock();
20410 pool = get_work_pool(work);
20412 - local_irq_enable();
20413 + rcu_read_unlock();
20417 - spin_lock(&pool->lock);
20418 + spin_lock_irq(&pool->lock);
20419 /* see the comment in try_to_grab_pending() with the same code */
20420 pwq = get_work_pwq(work);
20422 @@ -2821,10 +2859,11 @@
20424 lock_map_acquire_read(&pwq->wq->lockdep_map);
20425 lock_map_release(&pwq->wq->lockdep_map);
20427 + rcu_read_unlock();
20430 spin_unlock_irq(&pool->lock);
20431 + rcu_read_unlock();
20435 @@ -2911,7 +2950,7 @@
20437 /* tell other tasks trying to grab @work to back off */
20438 mark_work_canceling(work);
20439 - local_irq_restore(flags);
20440 + local_unlock_irqrestore(pendingb_lock, flags);
20443 clear_work_data(work);
20444 @@ -2966,10 +3005,10 @@
20446 bool flush_delayed_work(struct delayed_work *dwork)
20448 - local_irq_disable();
20449 + local_lock_irq(pendingb_lock);
20450 if (del_timer_sync(&dwork->timer))
20451 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
20452 - local_irq_enable();
20453 + local_unlock_irq(pendingb_lock);
20454 return flush_work(&dwork->work);
20456 EXPORT_SYMBOL(flush_delayed_work);
20457 @@ -2987,7 +3026,7 @@
20460 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
20461 - local_irq_restore(flags);
20462 + local_unlock_irqrestore(pendingb_lock, flags);
20466 @@ -3245,7 +3284,7 @@
20467 * put_unbound_pool - put a worker_pool
20468 * @pool: worker_pool to put
20470 - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
20471 + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
20472 * safe manner. get_unbound_pool() calls this function on its failure path
20473 * and this function should be able to release pools which went through,
20474 * successfully or not, init_worker_pool().
20475 @@ -3299,8 +3338,8 @@
20476 del_timer_sync(&pool->idle_timer);
20477 del_timer_sync(&pool->mayday_timer);
20479 - /* sched-RCU protected to allow dereferences from get_work_pool() */
20480 - call_rcu_sched(&pool->rcu, rcu_free_pool);
20481 + /* RCU protected to allow dereferences from get_work_pool() */
20482 + call_rcu(&pool->rcu, rcu_free_pool);
20486 @@ -3407,14 +3446,14 @@
20487 put_unbound_pool(pool);
20488 mutex_unlock(&wq_pool_mutex);
20490 - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
20491 + call_rcu(&pwq->rcu, rcu_free_pwq);
20494 * If we're the last pwq going away, @wq is already dead and no one
20495 * is gonna access it anymore. Schedule RCU free.
20498 - call_rcu_sched(&wq->rcu, rcu_free_wq);
20499 + call_rcu(&wq->rcu, rcu_free_wq);
20503 @@ -4064,7 +4103,7 @@
20504 * The base ref is never dropped on per-cpu pwqs. Directly
20505 * schedule RCU free.
20507 - call_rcu_sched(&wq->rcu, rcu_free_wq);
20508 + call_rcu(&wq->rcu, rcu_free_wq);
20511 * We're the sole accessor of @wq at this point. Directly
20512 @@ -4157,7 +4196,8 @@
20513 struct pool_workqueue *pwq;
20516 - rcu_read_lock_sched();
20517 + rcu_read_lock();
20518 + preempt_disable();
20520 if (cpu == WORK_CPU_UNBOUND)
20521 cpu = smp_processor_id();
20522 @@ -4168,7 +4208,8 @@
20523 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
20525 ret = !list_empty(&pwq->delayed_works);
20526 - rcu_read_unlock_sched();
20527 + preempt_enable();
20528 + rcu_read_unlock();
20532 @@ -4194,15 +4235,15 @@
20533 if (work_pending(work))
20534 ret |= WORK_BUSY_PENDING;
20536 - local_irq_save(flags);
20537 + rcu_read_lock();
20538 pool = get_work_pool(work);
20540 - spin_lock(&pool->lock);
20541 + spin_lock_irqsave(&pool->lock, flags);
20542 if (find_worker_executing_work(pool, work))
20543 ret |= WORK_BUSY_RUNNING;
20544 - spin_unlock(&pool->lock);
20545 + spin_unlock_irqrestore(&pool->lock, flags);
20547 - local_irq_restore(flags);
20548 + rcu_read_unlock();
20552 @@ -4391,7 +4432,7 @@
20553 unsigned long flags;
20556 - rcu_read_lock_sched();
20557 + rcu_read_lock();
20559 pr_info("Showing busy workqueues and worker pools:\n");
20561 @@ -4444,7 +4485,7 @@
20562 spin_unlock_irqrestore(&pool->lock, flags);
20565 - rcu_read_unlock_sched();
20566 + rcu_read_unlock();
20570 @@ -4782,16 +4823,16 @@
20571 * nr_active is monotonically decreasing. It's safe
20572 * to peek without lock.
20574 - rcu_read_lock_sched();
20575 + rcu_read_lock();
20576 for_each_pwq(pwq, wq) {
20577 WARN_ON_ONCE(pwq->nr_active < 0);
20578 if (pwq->nr_active) {
20580 - rcu_read_unlock_sched();
20581 + rcu_read_unlock();
20585 - rcu_read_unlock_sched();
20586 + rcu_read_unlock();
20589 mutex_unlock(&wq_pool_mutex);
20590 @@ -4981,7 +5022,8 @@
20591 const char *delim = "";
20592 int node, written = 0;
20594 - rcu_read_lock_sched();
20595 + get_online_cpus();
20596 + rcu_read_lock();
20597 for_each_node(node) {
20598 written += scnprintf(buf + written, PAGE_SIZE - written,
20599 "%s%d:%d", delim, node,
20600 @@ -4989,7 +5031,8 @@
20603 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
20604 - rcu_read_unlock_sched();
20605 + rcu_read_unlock();
20606 + put_online_cpus();
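Every rcu_read_lock_sched()/call_rcu_sched() in this file becomes plain RCU: on RT, a preempt- or irq-disabled region no longer implies an RCU-sched read side, so the read sections have to be explicit. pendingb_lock is a local lock, which a non-RT kernel compiles down to local_irq_save() while RT turns it into a per-CPU lock that keeps the section preemptible. A userspace analogue of the local-lock idea, with per-CPU modeled as per-index (a sketch under that assumption):

#include <pthread.h>
#include <stdio.h>

#define NCPU 2

static pthread_mutex_t pending_lock[NCPU] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
static int pending[NCPU];

static void queue_on(int cpu, int n)
{
	pthread_mutex_lock(&pending_lock[cpu]);	  /* local_lock_irqsave() */
	pending[cpu] += n;			  /* per-"CPU" state */
	pthread_mutex_unlock(&pending_lock[cpu]); /* local_unlock_irqrestore() */
}

int main(void)
{
	queue_on(0, 1);
	queue_on(1, 2);
	printf("pending: cpu0=%d cpu1=%d\n", pending[0], pending[1]);
	return 0;
}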
20610 diff -Nur linux-4.9.6.orig/kernel/workqueue_internal.h linux-4.9.6/kernel/workqueue_internal.h
20611 --- linux-4.9.6.orig/kernel/workqueue_internal.h 2017-01-26 08:25:24.000000000 +0100
20612 +++ linux-4.9.6/kernel/workqueue_internal.h 2017-01-28 13:59:10.095662491 +0100
20614 unsigned long last_active; /* L: last active timestamp */
20615 unsigned int flags; /* X: flags */
20616 int id; /* I: worker id */
20617 + int sleeping; /* None */
20620 * Opaque string set with work_set_desc(). Printed out with task
20622 * Scheduler hooks for concurrency managed workqueue. Only to be used from
20623 * sched/core.c and workqueue.c.
20625 -void wq_worker_waking_up(struct task_struct *task, int cpu);
20626 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
20627 +void wq_worker_running(struct task_struct *task);
20628 +void wq_worker_sleeping(struct task_struct *task);
20630 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
20631 diff -Nur linux-4.9.6.orig/lib/debugobjects.c linux-4.9.6/lib/debugobjects.c
20632 --- linux-4.9.6.orig/lib/debugobjects.c 2017-01-26 08:25:24.000000000 +0100
20633 +++ linux-4.9.6/lib/debugobjects.c 2017-01-28 13:59:10.095662491 +0100
20634 @@ -308,7 +308,10 @@
20635 struct debug_obj *obj;
20636 unsigned long flags;
20638 - fill_pool();
20639 +#ifdef CONFIG_PREEMPT_RT_FULL
20640 + if (preempt_count() == 0 && !irqs_disabled())
20641 +#endif
20642 + fill_pool();
20644 db = get_bucket((unsigned long) addr);
20646 diff -Nur linux-4.9.6.orig/lib/idr.c linux-4.9.6/lib/idr.c
20647 --- linux-4.9.6.orig/lib/idr.c 2017-01-26 08:25:24.000000000 +0100
20648 +++ linux-4.9.6/lib/idr.c 2017-01-28 13:59:10.095662491 +0100
20650 #include <linux/idr.h>
20651 #include <linux/spinlock.h>
20652 #include <linux/percpu.h>
20653 +#include <linux/locallock.h>
20655 #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
20656 #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
20658 static DEFINE_PER_CPU(int, idr_preload_cnt);
20659 static DEFINE_SPINLOCK(simple_ida_lock);
20661 +#ifdef CONFIG_PREEMPT_RT_FULL
20662 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
20664 +static inline void idr_preload_lock(void)
20665 +{
20666 + local_lock(idr_lock);
20667 +}
20668 +
20669 +static inline void idr_preload_unlock(void)
20670 +{
20671 + local_unlock(idr_lock);
20672 +}
20673 +
20674 +void idr_preload_end(void)
20675 +{
20676 + idr_preload_unlock();
20677 +}
20678 +EXPORT_SYMBOL(idr_preload_end);
20679 +#else
20680 +static inline void idr_preload_lock(void)
20681 +{
20682 + preempt_disable();
20683 +}
20684 +
20685 +static inline void idr_preload_unlock(void)
20686 +{
20687 + preempt_enable();
20688 +}
20689 +#endif
20692 /* the maximum ID which can be allocated given idr->layers */
20693 static int idr_max(int layers)
20695 @@ -115,14 +147,14 @@
20696 * context. See idr_preload() for details.
20698 if (!in_interrupt()) {
20699 - preempt_disable();
20700 + idr_preload_lock();
20701 new = __this_cpu_read(idr_preload_head);
20703 __this_cpu_write(idr_preload_head, new->ary[0]);
20704 __this_cpu_dec(idr_preload_cnt);
20705 new->ary[0] = NULL;
20707 - preempt_enable();
20708 + idr_preload_unlock();
20712 @@ -366,7 +398,6 @@
20713 idr_mark_full(pa, id);
20718 * idr_preload - preload for idr_alloc()
20719 * @gfp_mask: allocation mask to use for preloading
20720 @@ -401,7 +432,7 @@
20721 WARN_ON_ONCE(in_interrupt());
20722 might_sleep_if(gfpflags_allow_blocking(gfp_mask));
20724 - preempt_disable();
20725 + idr_preload_lock();
20728 * idr_alloc() is likely to succeed w/o full idr_layer buffer and
20729 @@ -413,9 +444,9 @@
20730 while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
20731 struct idr_layer *new;
20733 - preempt_enable();
20734 + idr_preload_unlock();
20735 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
20736 - preempt_disable();
20737 + idr_preload_lock();
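idr_preload() stocks a per-CPU reserve of tree layers that the following allocation consumes. Non-RT pins the task with preempt_disable() until idr_preload_end(); RT cannot hold that across kmem_cache_zalloc(), so the idr_lock local lock takes over and idr_preload_end() becomes a real exported function that drops it. The underlying pattern, reserve while sleeping is allowed and consume where it is not, sketched for userspace (sizes and names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define WANT 4

static void *cache[WANT];
static int cached;

static void preload(void)
{
	while (cached < WANT) {
		void *p = malloc(64);	/* may "sleep": done unlocked */
		if (!p)
			break;
		cache[cached++] = p;
	}
	/* the real code would now take the lock and hold it */
}

static void *alloc_locked(void)
{
	/* inside the locked region: only consume, never allocate */
	return cached ? cache[--cached] : NULL;
}

int main(void)
{
	preload();
	void *node = alloc_locked();
	printf("got node %p, %d left in cache\n", node, cached);
	free(node);
	while (cached)
		free(cache[--cached]);
	return 0;
}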
20741 diff -Nur linux-4.9.6.orig/lib/irq_poll.c linux-4.9.6/lib/irq_poll.c
20742 --- linux-4.9.6.orig/lib/irq_poll.c 2017-01-26 08:25:24.000000000 +0100
20743 +++ linux-4.9.6/lib/irq_poll.c 2017-01-28 13:59:10.095662491 +0100
20745 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
20746 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20747 local_irq_restore(flags);
20748 + preempt_check_resched_rt();
20750 EXPORT_SYMBOL(irq_poll_sched);
20753 local_irq_save(flags);
20754 __irq_poll_complete(iop);
20755 local_irq_restore(flags);
20756 + preempt_check_resched_rt();
20758 EXPORT_SYMBOL(irq_poll_complete);
20763 local_irq_enable();
20764 + preempt_check_resched_rt();
20766 /* Even though interrupts have been re-enabled, this
20767 * access is safe because interrupts can only add new
20768 @@ -132,6 +135,7 @@
20769 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20771 local_irq_enable();
20772 + preempt_check_resched_rt();
20776 @@ -195,6 +199,7 @@
20777 this_cpu_ptr(&blk_cpu_iopoll));
20778 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20779 local_irq_enable();
20780 + preempt_check_resched_rt();
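Each irq_poll path that re-enables interrupts now ends with preempt_check_resched_rt(): on RT a raised softirq makes the softirq thread runnable, and once the irq-off region ends something has to notice that, because nothing will preempt the current context on its own. A toy model of the deferred check, with flags standing in for real interrupts (illustrative only):

#include <stdio.h>

static int irqs_off;
static int wakeup_pending;

static void raise_softirq(void)
{
	wakeup_pending = 1;	/* cannot act while irqs_off is set */
}

static void local_irq_restore_and_check(void)
{
	irqs_off = 0;
	if (wakeup_pending) {	/* preempt_check_resched_rt() */
		wakeup_pending = 0;
		printf("waking softirq thread\n");
	}
}

int main(void)
{
	irqs_off = 1;
	raise_softirq();
	local_irq_restore_and_check();
	return 0;
}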
20784 diff -Nur linux-4.9.6.orig/lib/Kconfig linux-4.9.6/lib/Kconfig
20785 --- linux-4.9.6.orig/lib/Kconfig 2017-01-26 08:25:24.000000000 +0100
20786 +++ linux-4.9.6/lib/Kconfig 2017-01-28 13:59:10.095662491 +0100
20787 @@ -400,6 +400,7 @@
20789 config CPUMASK_OFFSTACK
20790 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
20791 + depends on !PREEMPT_RT_FULL
20793 Use dynamic allocation for cpumask_var_t, instead of putting
20794 them on the stack. This is a bit more expensive, but avoids
20795 diff -Nur linux-4.9.6.orig/lib/locking-selftest.c linux-4.9.6/lib/locking-selftest.c
20796 --- linux-4.9.6.orig/lib/locking-selftest.c 2017-01-26 08:25:24.000000000 +0100
20797 +++ linux-4.9.6/lib/locking-selftest.c 2017-01-28 13:59:10.095662491 +0100
20798 @@ -590,6 +590,8 @@
20799 #include "locking-selftest-spin-hardirq.h"
20800 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
20802 +#ifndef CONFIG_PREEMPT_RT_FULL
20804 #include "locking-selftest-rlock-hardirq.h"
20805 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
20807 @@ -605,9 +607,12 @@
20808 #include "locking-selftest-wlock-softirq.h"
20809 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
20816 +#ifndef CONFIG_PREEMPT_RT_FULL
20818 * Enabling hardirqs with a softirq-safe lock held:
20820 @@ -640,6 +645,8 @@
20827 * Enabling irqs with an irq-safe lock held:
20829 @@ -663,6 +670,8 @@
20830 #include "locking-selftest-spin-hardirq.h"
20831 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
20833 +#ifndef CONFIG_PREEMPT_RT_FULL
20835 #include "locking-selftest-rlock-hardirq.h"
20836 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
20838 @@ -678,6 +687,8 @@
20839 #include "locking-selftest-wlock-softirq.h"
20840 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
20847 @@ -709,6 +720,8 @@
20848 #include "locking-selftest-spin-hardirq.h"
20849 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
20851 +#ifndef CONFIG_PREEMPT_RT_FULL
20853 #include "locking-selftest-rlock-hardirq.h"
20854 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
20856 @@ -724,6 +737,8 @@
20857 #include "locking-selftest-wlock-softirq.h"
20858 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
20865 @@ -757,6 +772,8 @@
20866 #include "locking-selftest-spin-hardirq.h"
20867 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
20869 +#ifndef CONFIG_PREEMPT_RT_FULL
20871 #include "locking-selftest-rlock-hardirq.h"
20872 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
20874 @@ -772,10 +789,14 @@
20875 #include "locking-selftest-wlock-softirq.h"
20876 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
20884 +#ifndef CONFIG_PREEMPT_RT_FULL
20887 * read-lock / write-lock irq inversion.
20889 @@ -838,6 +859,10 @@
20895 +#ifndef CONFIG_PREEMPT_RT_FULL
20898 * read-lock / write-lock recursion that is actually safe.
20900 @@ -876,6 +901,8 @@
20907 * read-lock / write-lock recursion that is unsafe.
20909 @@ -1858,6 +1885,7 @@
20911 printk(" --------------------------------------------------------------------------\n");
20913 +#ifndef CONFIG_PREEMPT_RT_FULL
20915 * irq-context testcases:
20917 @@ -1870,6 +1898,28 @@
20919 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
20920 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
20922 + /* On -rt, we only do hardirq context test for raw spinlock */
20923 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
20924 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
20926 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
20927 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
20929 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
20930 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
20931 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
20932 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
20933 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
20934 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
20936 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
20937 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
20938 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
20939 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
20940 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
20941 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
20946 diff -Nur linux-4.9.6.orig/lib/percpu_ida.c linux-4.9.6/lib/percpu_ida.c
20947 --- linux-4.9.6.orig/lib/percpu_ida.c 2017-01-26 08:25:24.000000000 +0100
20948 +++ linux-4.9.6/lib/percpu_ida.c 2017-01-28 13:59:10.095662491 +0100
20950 #include <linux/string.h>
20951 #include <linux/spinlock.h>
20952 #include <linux/percpu_ida.h>
20953 +#include <linux/locallock.h>
20955 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
20957 struct percpu_ida_cpu {
20959 @@ -148,13 +151,13 @@
20960 unsigned long flags;
20963 - local_irq_save(flags);
20964 + local_lock_irqsave(irq_off_lock, flags);
20965 tags = this_cpu_ptr(pool->tag_cpu);
20968 tag = alloc_local_tag(tags);
20969 if (likely(tag >= 0)) {
20970 - local_irq_restore(flags);
20971 + local_unlock_irqrestore(irq_off_lock, flags);
20975 @@ -173,6 +176,7 @@
20977 if (!tags->nr_free)
20978 alloc_global_tags(pool, tags);
20980 if (!tags->nr_free)
20981 steal_tags(pool, tags);
20983 @@ -184,7 +188,7 @@
20986 spin_unlock(&pool->lock);
20987 - local_irq_restore(flags);
20988 + local_unlock_irqrestore(irq_off_lock, flags);
20990 if (tag >= 0 || state == TASK_RUNNING)
20992 @@ -196,7 +200,7 @@
20996 - local_irq_save(flags);
20997 + local_lock_irqsave(irq_off_lock, flags);
20998 tags = this_cpu_ptr(pool->tag_cpu);
21000 if (state != TASK_RUNNING)
21001 @@ -221,7 +225,7 @@
21003 BUG_ON(tag >= pool->nr_tags);
21005 - local_irq_save(flags);
21006 + local_lock_irqsave(irq_off_lock, flags);
21007 tags = this_cpu_ptr(pool->tag_cpu);
21009 spin_lock(&tags->lock);
21010 @@ -253,7 +257,7 @@
21011 spin_unlock(&pool->lock);
21014 - local_irq_restore(flags);
21015 + local_unlock_irqrestore(irq_off_lock, flags);
21017 EXPORT_SYMBOL_GPL(percpu_ida_free);
21019 @@ -345,7 +349,7 @@
21020 struct percpu_ida_cpu *remote;
21021 unsigned cpu, i, err = 0;
21023 - local_irq_save(flags);
21024 + local_lock_irqsave(irq_off_lock, flags);
21025 for_each_possible_cpu(cpu) {
21026 remote = per_cpu_ptr(pool->tag_cpu, cpu);
21027 spin_lock(&remote->lock);
21028 @@ -367,7 +371,7 @@
21030 spin_unlock(&pool->lock);
21032 - local_irq_restore(flags);
21033 + local_unlock_irqrestore(irq_off_lock, flags);
21036 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
21037 diff -Nur linux-4.9.6.orig/lib/radix-tree.c linux-4.9.6/lib/radix-tree.c
21038 --- linux-4.9.6.orig/lib/radix-tree.c 2017-01-26 08:25:24.000000000 +0100
21039 +++ linux-4.9.6/lib/radix-tree.c 2017-01-28 13:59:10.095662491 +0100
21040 @@ -290,13 +290,14 @@
21041 * succeed in getting a node here (and never reach
21042 * kmem_cache_alloc)
21044 - rtp = this_cpu_ptr(&radix_tree_preloads);
21045 + rtp = &get_cpu_var(radix_tree_preloads);
21048 rtp->nodes = ret->private_data;
21049 ret->private_data = NULL;
21052 + put_cpu_var(radix_tree_preloads);
21054 * Update the allocation stack trace as this is more useful
21056 @@ -336,6 +337,7 @@
21057 call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
21060 +#ifndef CONFIG_PREEMPT_RT_FULL
21062 * Load up this CPU's radix_tree_node buffer with sufficient objects to
21063 * ensure that the addition of a single element in the tree cannot fail. On
21064 @@ -455,6 +457,7 @@
21066 return __radix_tree_preload(gfp_mask, nr_nodes);
21071 * The maximum index which can be stored in a radix tree
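The allocation path switches from this_cpu_ptr(), which is only valid if the caller already disabled preemption, to get_cpu_var()/put_cpu_var(), which disables preemption for exactly the lines that touch the per-CPU preload pool; the preload API itself is compiled out on RT. The get/put discipline, with preemption modeled as a counter (userspace sketch):

#include <stdio.h>

static int preempt_count;
static int percpu_pool[4];		/* one slot per "CPU" */

static int *get_cpu_var_slot(void)
{
	preempt_count++;		/* preempt_disable() */
	return &percpu_pool[0];		/* current CPU's slot */
}

static void put_cpu_var_slot(void)
{
	preempt_count--;		/* preempt_enable() */
}

int main(void)
{
	int *slot = get_cpu_var_slot();
	*slot += 1;			/* stable: no migration possible here */
	put_cpu_var_slot();
	printf("slot=%d preempt_count=%d\n", percpu_pool[0], preempt_count);
	return 0;
}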
21072 diff -Nur linux-4.9.6.orig/lib/scatterlist.c linux-4.9.6/lib/scatterlist.c
21073 --- linux-4.9.6.orig/lib/scatterlist.c 2017-01-26 08:25:24.000000000 +0100
21074 +++ linux-4.9.6/lib/scatterlist.c 2017-01-28 13:59:10.095662491 +0100
21075 @@ -620,7 +620,7 @@
21076 flush_kernel_dcache_page(miter->page);
21078 if (miter->__flags & SG_MITER_ATOMIC) {
21079 - WARN_ON_ONCE(preemptible());
21080 + WARN_ON_ONCE(!pagefault_disabled());
21081 kunmap_atomic(miter->addr);
21083 kunmap(miter->page);
21084 @@ -664,7 +664,7 @@
21085 if (!sg_miter_skip(&miter, skip))
21088 - local_irq_save(flags);
21089 + local_irq_save_nort(flags);
21091 while (sg_miter_next(&miter) && offset < buflen) {
21093 @@ -681,7 +681,7 @@
21095 sg_miter_stop(&miter);
21097 - local_irq_restore(flags);
21098 + local_irq_restore_nort(flags);
21101 EXPORT_SYMBOL(sg_copy_buffer);
21102 diff -Nur linux-4.9.6.orig/lib/smp_processor_id.c linux-4.9.6/lib/smp_processor_id.c
21103 --- linux-4.9.6.orig/lib/smp_processor_id.c 2017-01-26 08:25:24.000000000 +0100
21104 +++ linux-4.9.6/lib/smp_processor_id.c 2017-01-28 13:59:10.095662491 +0100
21106 if (!printk_ratelimit())
21109 - printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
21110 - what1, what2, preempt_count() - 1, current->comm, current->pid);
21111 + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
21112 + what1, what2, preempt_count() - 1, __migrate_disabled(current),
21113 + current->comm, current->pid);
21115 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
21117 diff -Nur linux-4.9.6.orig/mm/backing-dev.c linux-4.9.6/mm/backing-dev.c
21118 --- linux-4.9.6.orig/mm/backing-dev.c 2017-01-26 08:25:24.000000000 +0100
21119 +++ linux-4.9.6/mm/backing-dev.c 2017-01-28 13:59:10.095662491 +0100
21120 @@ -457,9 +457,9 @@
21122 unsigned long flags;
21124 - local_irq_save(flags);
21125 + local_irq_save_nort(flags);
21126 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
21127 - local_irq_restore(flags);
21128 + local_irq_restore_nort(flags);
21132 diff -Nur linux-4.9.6.orig/mm/compaction.c linux-4.9.6/mm/compaction.c
21133 --- linux-4.9.6.orig/mm/compaction.c 2017-01-26 08:25:24.000000000 +0100
21134 +++ linux-4.9.6/mm/compaction.c 2017-01-28 13:59:10.095662491 +0100
21135 @@ -1593,10 +1593,12 @@
21136 block_start_pfn(cc->migrate_pfn, cc->order);
21138 if (cc->last_migrated_pfn < current_block_start) {
21139 - cpu = get_cpu();
21140 + cpu = get_cpu_light();
21141 + local_lock_irq(swapvec_lock);
21142 lru_add_drain_cpu(cpu);
21143 + local_unlock_irq(swapvec_lock);
21144 drain_local_pages(zone);
21145 - put_cpu();
21146 + put_cpu_light();
21147 /* No more flushing until we migrate again */
21148 cc->last_migrated_pfn = 0;
21150 diff -Nur linux-4.9.6.orig/mm/filemap.c linux-4.9.6/mm/filemap.c
21151 --- linux-4.9.6.orig/mm/filemap.c 2017-01-26 08:25:24.000000000 +0100
21152 +++ linux-4.9.6/mm/filemap.c 2017-01-28 13:59:10.099662644 +0100
21153 @@ -159,9 +159,12 @@
21154 * node->private_list is protected by
21155 * mapping->tree_lock.
21157 - if (!list_empty(&node->private_list))
21158 - list_lru_del(&workingset_shadow_nodes,
21159 + if (!list_empty(&node->private_list)) {
21160 + local_lock(workingset_shadow_lock);
21161 + list_lru_del(&__workingset_shadow_nodes,
21162 &node->private_list);
21163 + local_unlock(workingset_shadow_lock);
21168 @@ -217,8 +220,10 @@
21169 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
21170 list_empty(&node->private_list)) {
21171 node->private_data = mapping;
21172 - list_lru_add(&workingset_shadow_nodes,
21173 - &node->private_list);
21174 + local_lock(workingset_shadow_lock);
21175 + list_lru_add(&__workingset_shadow_nodes,
21176 + &node->private_list);
21177 + local_unlock(workingset_shadow_lock);
21181 diff -Nur linux-4.9.6.orig/mm/highmem.c linux-4.9.6/mm/highmem.c
21182 --- linux-4.9.6.orig/mm/highmem.c 2017-01-26 08:25:24.000000000 +0100
21183 +++ linux-4.9.6/mm/highmem.c 2017-01-28 13:59:10.099662644 +0100
21184 @@ -29,10 +29,11 @@
21185 #include <linux/kgdb.h>
21186 #include <asm/tlbflush.h>
21189 +#ifndef CONFIG_PREEMPT_RT_FULL
21190 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
21191 DEFINE_PER_CPU(int, __kmap_atomic_idx);
21192 #endif
21193 +#endif
21196 * Virtual_count is not a pure "count".
21197 @@ -107,8 +108,9 @@
21198 unsigned long totalhigh_pages __read_mostly;
21199 EXPORT_SYMBOL(totalhigh_pages);
21202 +#ifndef CONFIG_PREEMPT_RT_FULL
21203 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
21204 +#endif
21206 unsigned int nr_free_highpages (void)
21208 diff -Nur linux-4.9.6.orig/mm/Kconfig linux-4.9.6/mm/Kconfig
21209 --- linux-4.9.6.orig/mm/Kconfig 2017-01-26 08:25:24.000000000 +0100
21210 +++ linux-4.9.6/mm/Kconfig 2017-01-28 13:59:10.095662491 +0100
21211 @@ -410,7 +410,7 @@
21213 config TRANSPARENT_HUGEPAGE
21214 bool "Transparent Hugepage Support"
21215 - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
21216 + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
21218 select RADIX_TREE_MULTIORDER
21220 diff -Nur linux-4.9.6.orig/mm/memcontrol.c linux-4.9.6/mm/memcontrol.c
21221 --- linux-4.9.6.orig/mm/memcontrol.c 2017-01-26 08:25:24.000000000 +0100
21222 +++ linux-4.9.6/mm/memcontrol.c 2017-01-28 13:59:10.099662644 +0100
21224 #include <net/sock.h>
21225 #include <net/ip.h>
21227 +#include <linux/locallock.h>
21229 #include <asm/uaccess.h>
21232 #define do_swap_account 0
21235 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
21237 /* Whether legacy memory+swap accounting is active */
21238 static bool do_memsw_account(void)
21240 @@ -1692,6 +1695,7 @@
21241 #define FLUSHING_CACHED_CHARGE 0
21243 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
21244 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
21245 static DEFINE_MUTEX(percpu_charge_mutex);
21248 @@ -1714,7 +1718,7 @@
21249 if (nr_pages > CHARGE_BATCH)
21252 - local_irq_save(flags);
21253 + local_lock_irqsave(memcg_stock_ll, flags);
21255 stock = this_cpu_ptr(&memcg_stock);
21256 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
21257 @@ -1722,7 +1726,7 @@
21261 - local_irq_restore(flags);
21262 + local_unlock_irqrestore(memcg_stock_ll, flags);
21266 @@ -1749,13 +1753,13 @@
21267 struct memcg_stock_pcp *stock;
21268 unsigned long flags;
21270 - local_irq_save(flags);
21271 + local_lock_irqsave(memcg_stock_ll, flags);
21273 stock = this_cpu_ptr(&memcg_stock);
21274 drain_stock(stock);
21275 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
21277 - local_irq_restore(flags);
21278 + local_unlock_irqrestore(memcg_stock_ll, flags);
21282 @@ -1767,7 +1771,7 @@
21283 struct memcg_stock_pcp *stock;
21284 unsigned long flags;
21286 - local_irq_save(flags);
21287 + local_lock_irqsave(memcg_stock_ll, flags);
21289 stock = this_cpu_ptr(&memcg_stock);
21290 if (stock->cached != memcg) { /* reset if necessary */
21291 @@ -1776,7 +1780,7 @@
21293 stock->nr_pages += nr_pages;
21295 - local_irq_restore(flags);
21296 + local_unlock_irqrestore(memcg_stock_ll, flags);
21300 @@ -1792,7 +1796,7 @@
21302 /* Notify other cpus that system-wide "drain" is running */
21304 - curcpu = get_cpu();
21305 + curcpu = get_cpu_light();
21306 for_each_online_cpu(cpu) {
21307 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
21308 struct mem_cgroup *memcg;
21309 @@ -1809,7 +1813,7 @@
21310 schedule_work_on(cpu, &stock->work);
21316 mutex_unlock(&percpu_charge_mutex);
21318 @@ -4548,12 +4552,12 @@
21322 - local_irq_disable();
21323 + local_lock_irq(event_lock);
21324 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
21325 memcg_check_events(to, page);
21326 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
21327 memcg_check_events(from, page);
21328 - local_irq_enable();
21329 + local_unlock_irq(event_lock);
21333 @@ -5428,10 +5432,10 @@
21335 commit_charge(page, memcg, lrucare);
21337 - local_irq_disable();
21338 + local_lock_irq(event_lock);
21339 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
21340 memcg_check_events(memcg, page);
21341 - local_irq_enable();
21342 + local_unlock_irq(event_lock);
21344 if (do_memsw_account() && PageSwapCache(page)) {
21345 swp_entry_t entry = { .val = page_private(page) };
21346 @@ -5487,14 +5491,14 @@
21347 memcg_oom_recover(memcg);
21350 - local_irq_save(flags);
21351 + local_lock_irqsave(event_lock, flags);
21352 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
21353 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
21354 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
21355 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
21356 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
21357 memcg_check_events(memcg, dummy_page);
21358 - local_irq_restore(flags);
21359 + local_unlock_irqrestore(event_lock, flags);
21361 if (!mem_cgroup_is_root(memcg))
21362 css_put_many(&memcg->css, nr_pages);
21363 @@ -5649,10 +5653,10 @@
21365 commit_charge(newpage, memcg, false);
21367 - local_irq_save(flags);
21368 + local_lock_irqsave(event_lock, flags);
21369 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
21370 memcg_check_events(memcg, newpage);
21371 - local_irq_restore(flags);
21372 + local_unlock_irqrestore(event_lock, flags);
21375 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
21376 @@ -5832,6 +5836,7 @@
21378 struct mem_cgroup *memcg, *swap_memcg;
21379 unsigned short oldid;
21380 + unsigned long flags;
21382 VM_BUG_ON_PAGE(PageLRU(page), page);
21383 VM_BUG_ON_PAGE(page_count(page), page);
21384 @@ -5872,12 +5877,16 @@
21385 * important here to have the interrupts disabled because it is the
21386 * only synchronisation we have for updating the per-CPU variables.
21388 + local_lock_irqsave(event_lock, flags);
21389 +#ifndef CONFIG_PREEMPT_RT_BASE
21390 VM_BUG_ON(!irqs_disabled());
21391 +#endif
21392 mem_cgroup_charge_statistics(memcg, page, false, -1);
21393 memcg_check_events(memcg, page);
21395 if (!mem_cgroup_is_root(memcg))
21396 css_put(&memcg->css);
21397 + local_unlock_irqrestore(event_lock, flags);
21401 diff -Nur linux-4.9.6.orig/mm/mmu_context.c linux-4.9.6/mm/mmu_context.c
21402 --- linux-4.9.6.orig/mm/mmu_context.c 2017-01-26 08:25:24.000000000 +0100
21403 +++ linux-4.9.6/mm/mmu_context.c 2017-01-28 13:59:10.099662644 +0100
21405 struct task_struct *tsk = current;
21408 + preempt_disable_rt();
21409 active_mm = tsk->active_mm;
21410 if (active_mm != mm) {
21411 atomic_inc(&mm->mm_count);
21415 switch_mm(active_mm, mm, tsk);
21416 + preempt_enable_rt();
21418 #ifdef finish_arch_post_lock_switch
21419 finish_arch_post_lock_switch();
21420 diff -Nur linux-4.9.6.orig/mm/page_alloc.c linux-4.9.6/mm/page_alloc.c
21421 --- linux-4.9.6.orig/mm/page_alloc.c 2017-01-26 08:25:24.000000000 +0100
21422 +++ linux-4.9.6/mm/page_alloc.c 2017-01-28 13:59:10.099662644 +0100
21424 #include <linux/page_ext.h>
21425 #include <linux/hugetlb.h>
21426 #include <linux/sched/rt.h>
21427 +#include <linux/locallock.h>
21428 #include <linux/page_owner.h>
21429 #include <linux/kthread.h>
21430 #include <linux/memcontrol.h>
21431 @@ -281,6 +282,18 @@
21432 EXPORT_SYMBOL(nr_online_nodes);
21435 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
21437 +#ifdef CONFIG_PREEMPT_RT_BASE
21438 +# define cpu_lock_irqsave(cpu, flags) \
21439 + local_lock_irqsave_on(pa_lock, flags, cpu)
21440 +# define cpu_unlock_irqrestore(cpu, flags) \
21441 + local_unlock_irqrestore_on(pa_lock, flags, cpu)
21442 +#else
21443 +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
21444 +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
21445 +#endif
21447 int page_group_by_mobility_disabled __read_mostly;
21449 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
21450 @@ -1072,7 +1085,7 @@
21451 #endif /* CONFIG_DEBUG_VM */
21454 - * Frees a number of pages from the PCP lists
21455 + * Frees a number of pages which have been collected from the pcp lists.
21456 * Assumes all pages on list are in same zone, and of same order.
21457 * count is the number of pages to free.
21459 @@ -1083,19 +1096,58 @@
21460 * pinned" detection logic.
21462 static void free_pcppages_bulk(struct zone *zone, int count,
21463 - struct per_cpu_pages *pcp)
21464 + struct list_head *list)
21466 - int migratetype = 0;
21467 - int batch_free = 0;
21468 unsigned long nr_scanned;
21469 bool isolated_pageblocks;
21470 + unsigned long flags;
21472 + spin_lock_irqsave(&zone->lock, flags);
21474 - spin_lock(&zone->lock);
21475 isolated_pageblocks = has_isolate_pageblock(zone);
21476 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21478 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21480 + while (!list_empty(list)) {
21481 + struct page *page;
21482 + int mt; /* migratetype of the to-be-freed page */
21484 + page = list_first_entry(list, struct page, lru);
21485 + /* must delete as __free_one_page list manipulates */
21486 + list_del(&page->lru);
21488 + mt = get_pcppage_migratetype(page);
21489 + /* MIGRATE_ISOLATE page should not go to pcplists */
21490 + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21491 + /* Pageblock could have been isolated meanwhile */
21492 + if (unlikely(isolated_pageblocks))
21493 + mt = get_pageblock_migratetype(page);
21495 + if (bulkfree_pcp_prepare(page))
21496 + continue;
21498 + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21499 + trace_mm_page_pcpu_drain(page, 0, mt);
21502 + WARN_ON(count != 0);
21503 + spin_unlock_irqrestore(&zone->lock, flags);
21507 + * Moves a number of pages from the PCP lists to a private list,
21508 + * which is then freed outside of the locked region.
21510 + * Assumes all pages on list are in same zone, and of same order.
21511 + * count is the number of pages to free.
21513 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
21514 + struct list_head *dst)
21516 + int migratetype = 0;
21517 + int batch_free = 0;
21521 struct list_head *list;
21522 @@ -1111,7 +1163,7 @@
21524 if (++migratetype == MIGRATE_PCPTYPES)
21526 - list = &pcp->lists[migratetype];
21527 + list = &src->lists[migratetype];
21528 } while (list_empty(list));
21530 /* This is the only non-empty list. Free them all. */
21531 @@ -1119,27 +1171,12 @@
21532 batch_free = count;
21535 - int mt; /* migratetype of the to-be-freed page */
21537 page = list_last_entry(list, struct page, lru);
21538 - /* must delete as __free_one_page list manipulates */
21539 list_del(&page->lru);
21541 - mt = get_pcppage_migratetype(page);
21542 - /* MIGRATE_ISOLATE page should not go to pcplists */
21543 - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21544 - /* Pageblock could have been isolated meanwhile */
21545 - if (unlikely(isolated_pageblocks))
21546 - mt = get_pageblock_migratetype(page);
21548 - if (bulkfree_pcp_prepare(page))
21551 - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21552 - trace_mm_page_pcpu_drain(page, 0, mt);
21553 + list_add(&page->lru, dst);
21554 } while (--count && --batch_free && !list_empty(list));
21556 - spin_unlock(&zone->lock);
21559 static void free_one_page(struct zone *zone,
21560 @@ -1148,7 +1185,9 @@
21563 unsigned long nr_scanned;
21564 - spin_lock(&zone->lock);
21565 + unsigned long flags;
21567 + spin_lock_irqsave(&zone->lock, flags);
21568 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21570 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21571 @@ -1158,7 +1197,7 @@
21572 migratetype = get_pfnblock_migratetype(page, pfn);
21574 __free_one_page(page, pfn, zone, order, migratetype);
21575 - spin_unlock(&zone->lock);
21576 + spin_unlock_irqrestore(&zone->lock, flags);
21579 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
21580 @@ -1244,10 +1283,10 @@
21583 migratetype = get_pfnblock_migratetype(page, pfn);
21584 - local_irq_save(flags);
21585 + local_lock_irqsave(pa_lock, flags);
21586 __count_vm_events(PGFREE, 1 << order);
21587 free_one_page(page_zone(page), page, pfn, order, migratetype);
21588 - local_irq_restore(flags);
21589 + local_unlock_irqrestore(pa_lock, flags);
21592 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
21593 @@ -2246,16 +2285,18 @@
21594 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
21596 unsigned long flags;
21597 + LIST_HEAD(dst);
21598 int to_drain, batch;
21600 - local_irq_save(flags);
21601 + local_lock_irqsave(pa_lock, flags);
21602 batch = READ_ONCE(pcp->batch);
21603 to_drain = min(pcp->count, batch);
21604 if (to_drain > 0) {
21605 - free_pcppages_bulk(zone, to_drain, pcp);
21606 + isolate_pcp_pages(to_drain, pcp, &dst);
21607 pcp->count -= to_drain;
21609 - local_irq_restore(flags);
21610 + local_unlock_irqrestore(pa_lock, flags);
21611 + free_pcppages_bulk(zone, to_drain, &dst);
21615 @@ -2271,16 +2312,21 @@
21616 unsigned long flags;
21617 struct per_cpu_pageset *pset;
21618 struct per_cpu_pages *pcp;
21619 + LIST_HEAD(dst);
21620 + int count;
21622 - local_irq_save(flags);
21623 + cpu_lock_irqsave(cpu, flags);
21624 pset = per_cpu_ptr(zone->pageset, cpu);
21627 - if (pcp->count) {
21628 - free_pcppages_bulk(zone, pcp->count, pcp);
21629 + count = pcp->count;
21631 + isolate_pcp_pages(count, pcp, &dst);
21634 - local_irq_restore(flags);
21635 + cpu_unlock_irqrestore(cpu, flags);
21637 + free_pcppages_bulk(zone, count, &dst);
21641 @@ -2366,8 +2412,17 @@
21643 cpumask_clear_cpu(cpu, &cpus_with_pcps);
21645 +#ifndef CONFIG_PREEMPT_RT_BASE
21646 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
21647 zone, 1);
21648 +#else
21649 + for_each_cpu(cpu, &cpus_with_pcps) {
21650 + if (zone)
21651 + drain_pages_zone(cpu, zone);
21652 + else
21653 + drain_pages(cpu);
21654 + }
21655 +#endif
21658 #ifdef CONFIG_HIBERNATION
21659 @@ -2427,7 +2482,7 @@
21661 migratetype = get_pfnblock_migratetype(page, pfn);
21662 set_pcppage_migratetype(page, migratetype);
21663 - local_irq_save(flags);
21664 + local_lock_irqsave(pa_lock, flags);
21665 __count_vm_event(PGFREE);
21668 @@ -2453,12 +2508,17 @@
21670 if (pcp->count >= pcp->high) {
21671 unsigned long batch = READ_ONCE(pcp->batch);
21672 - free_pcppages_bulk(zone, batch, pcp);
21675 + isolate_pcp_pages(batch, pcp, &dst);
21676 pcp->count -= batch;
21677 + local_unlock_irqrestore(pa_lock, flags);
21678 + free_pcppages_bulk(zone, batch, &dst);
21683 - local_irq_restore(flags);
21684 + local_unlock_irqrestore(pa_lock, flags);
21688 @@ -2600,7 +2660,7 @@
21689 struct per_cpu_pages *pcp;
21690 struct list_head *list;
21692 - local_irq_save(flags);
21693 + local_lock_irqsave(pa_lock, flags);
21695 pcp = &this_cpu_ptr(zone->pageset)->pcp;
21696 list = &pcp->lists[migratetype];
21697 @@ -2627,7 +2687,7 @@
21698 * allocate greater than order-1 page units with __GFP_NOFAIL.
21700 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
21701 - spin_lock_irqsave(&zone->lock, flags);
21702 + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
21706 @@ -2639,22 +2699,24 @@
21708 page = __rmqueue(zone, order, migratetype);
21709 } while (page && check_new_pages(page, order));
21710 - spin_unlock(&zone->lock);
21711 - if (!page)
21712 + if (!page) {
21713 + spin_unlock(&zone->lock);
21714 goto failed;
21715 + }
21716 __mod_zone_freepage_state(zone, -(1 << order),
21717 get_pcppage_migratetype(page));
21718 + spin_unlock(&zone->lock);
21721 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
21722 zone_statistics(preferred_zone, zone, gfp_flags);
21723 - local_irq_restore(flags);
21724 + local_unlock_irqrestore(pa_lock, flags);
21726 VM_BUG_ON_PAGE(bad_range(zone, page), page);
21730 - local_irq_restore(flags);
21731 + local_unlock_irqrestore(pa_lock, flags);
21735 @@ -6505,7 +6567,9 @@
21736 int cpu = (unsigned long)hcpu;
21738 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
21739 + local_lock_irq_on(swapvec_lock, cpu);
21740 lru_add_drain_cpu(cpu);
21741 + local_unlock_irq_on(swapvec_lock, cpu);
21745 @@ -6531,6 +6595,7 @@
21746 void __init page_alloc_init(void)
21748 hotcpu_notifier(page_alloc_cpu_notify, 0);
21749 + local_irq_lock_init(pa_lock);
21753 @@ -7359,7 +7424,7 @@
21754 struct per_cpu_pageset *pset;
21756 /* avoid races with drain_pages() */
21757 - local_irq_save(flags);
21758 + local_lock_irqsave(pa_lock, flags);
21759 if (zone->pageset != &boot_pageset) {
21760 for_each_online_cpu(cpu) {
21761 pset = per_cpu_ptr(zone->pageset, cpu);
21762 @@ -7368,7 +7433,7 @@
21763 free_percpu(zone->pageset);
21764 zone->pageset = &boot_pageset;
21766 - local_irq_restore(flags);
21767 + local_unlock_irqrestore(pa_lock, flags);
21770 #ifdef CONFIG_MEMORY_HOTREMOVE
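The recurring shape in this file: isolate_pcp_pages() detaches pages from the per-CPU lists under pa_lock, then free_pcppages_bulk() takes zone->lock only for the actual freeing, so neither lock is held across the other's work and the irq-off windows stay short. The split in miniature (userspace sketch; a mutex stands in for pa_lock):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pcp_list;

static struct node *isolate_all(void)
{
	pthread_mutex_lock(&pcp_lock);
	struct node *batch = pcp_list;	/* detach: cheap, O(1) */
	pcp_list = NULL;
	pthread_mutex_unlock(&pcp_lock);
	return batch;
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->next = pcp_list;
		pcp_list = n;
	}

	int freed = 0;
	struct node *batch = isolate_all();
	while (batch) {			/* expensive part, lock dropped */
		struct node *n = batch;
		batch = batch->next;
		free(n);
		freed++;
	}
	printf("freed %d pages\n", freed);
	return 0;
}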
21771 diff -Nur linux-4.9.6.orig/mm/slab.h linux-4.9.6/mm/slab.h
21772 --- linux-4.9.6.orig/mm/slab.h 2017-01-26 08:25:24.000000000 +0100
21773 +++ linux-4.9.6/mm/slab.h 2017-01-28 13:59:10.099662644 +0100
21774 @@ -426,7 +426,11 @@
21775 * The slab lists for all objects.
21777 struct kmem_cache_node {
21778 +#ifdef CONFIG_SLUB
21779 + raw_spinlock_t list_lock;
21781 spinlock_t list_lock;
21785 struct list_head slabs_partial; /* partial list first, better asm code */
21786 diff -Nur linux-4.9.6.orig/mm/slub.c linux-4.9.6/mm/slub.c
21787 --- linux-4.9.6.orig/mm/slub.c 2017-01-26 08:25:24.000000000 +0100
21788 +++ linux-4.9.6/mm/slub.c 2017-01-28 13:59:10.099662644 +0100
21789 @@ -1141,7 +1141,7 @@
21790 unsigned long uninitialized_var(flags);
21793 - spin_lock_irqsave(&n->list_lock, flags);
21794 + raw_spin_lock_irqsave(&n->list_lock, flags);
21797 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
21798 @@ -1176,7 +1176,7 @@
21802 - spin_unlock_irqrestore(&n->list_lock, flags);
21803 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
21805 slab_fix(s, "Object at 0x%p not freed", object);
21807 @@ -1304,6 +1304,12 @@
21809 #endif /* CONFIG_SLUB_DEBUG */
21811 +struct slub_free_list {
21812 + raw_spinlock_t lock;
21813 + struct list_head list;
21814 +};
21815 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
21818 * Hooks for other subsystems that check memory allocations. In a typical
21819 * production configuration these hooks all should produce no code at all.
21820 @@ -1523,10 +1529,17 @@
21824 + bool enableirqs = false;
21826 flags &= gfp_allowed_mask;
21828 if (gfpflags_allow_blocking(flags))
21829 + enableirqs = true;
21830 +#ifdef CONFIG_PREEMPT_RT_FULL
21831 + if (system_state == SYSTEM_RUNNING)
21832 + enableirqs = true;
21833 +#endif
21834 + if (enableirqs)
21835 local_irq_enable();
21837 flags |= s->allocflags;
21838 @@ -1601,7 +1614,7 @@
21842 - if (gfpflags_allow_blocking(flags))
21843 + if (enableirqs)
21844 local_irq_disable();
21847 @@ -1660,6 +1673,16 @@
21848 __free_pages(page, order);
21851 +static void free_delayed(struct list_head *h)
21852 +{
21853 + while (!list_empty(h)) {
21854 + struct page *page = list_first_entry(h, struct page, lru);
21855 +
21856 + list_del(&page->lru);
21857 + __free_slab(page->slab_cache, page);
21858 + }
21859 +}
21861 #define need_reserve_slab_rcu \
21862 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
21864 @@ -1691,6 +1714,12 @@
21867 call_rcu(head, rcu_free_slab);
21868 + } else if (irqs_disabled()) {
21869 + struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
21871 + raw_spin_lock(&f->lock);
21872 + list_add(&page->lru, &f->list);
21873 + raw_spin_unlock(&f->lock);
21875 __free_slab(s, page);
21877 @@ -1798,7 +1827,7 @@
21878 if (!n || !n->nr_partial)
21881 - spin_lock(&n->list_lock);
21882 + raw_spin_lock(&n->list_lock);
21883 list_for_each_entry_safe(page, page2, &n->partial, lru) {
21886 @@ -1823,7 +1852,7 @@
21890 - spin_unlock(&n->list_lock);
21891 + raw_spin_unlock(&n->list_lock);
21895 @@ -2069,7 +2098,7 @@
21896 * that acquire_slab() will see a slab page that
21899 - spin_lock(&n->list_lock);
21900 + raw_spin_lock(&n->list_lock);
21904 @@ -2080,7 +2109,7 @@
21905 * slabs from diagnostic functions will not see
21906 * any frozen slabs.
21908 - spin_lock(&n->list_lock);
21909 + raw_spin_lock(&n->list_lock);
21913 @@ -2115,7 +2144,7 @@
21917 - spin_unlock(&n->list_lock);
21918 + raw_spin_unlock(&n->list_lock);
21921 stat(s, DEACTIVATE_EMPTY);
21922 @@ -2147,10 +2176,10 @@
21923 n2 = get_node(s, page_to_nid(page));
21926 - spin_unlock(&n->list_lock);
21927 + raw_spin_unlock(&n->list_lock);
21930 - spin_lock(&n->list_lock);
21931 + raw_spin_lock(&n->list_lock);
21935 @@ -2179,7 +2208,7 @@
21939 - spin_unlock(&n->list_lock);
21940 + raw_spin_unlock(&n->list_lock);
21942 while (discard_page) {
21943 page = discard_page;
21944 @@ -2218,14 +2247,21 @@
21945 pobjects = oldpage->pobjects;
21946 pages = oldpage->pages;
21947 if (drain && pobjects > s->cpu_partial) {
21948 + struct slub_free_list *f;
21949 unsigned long flags;
21950 + LIST_HEAD(tofree);
21952 * partial array is full. Move the existing
21953 * set to the per node partial list.
21955 local_irq_save(flags);
21956 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
21957 + f = this_cpu_ptr(&slub_free_list);
21958 + raw_spin_lock(&f->lock);
21959 + list_splice_init(&f->list, &tofree);
21960 + raw_spin_unlock(&f->lock);
21961 local_irq_restore(flags);
21962 + free_delayed(&tofree);
21966 @@ -2297,7 +2333,22 @@
21968 static void flush_all(struct kmem_cache *s)
21970 + LIST_HEAD(tofree);
21973 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
21974 + for_each_online_cpu(cpu) {
21975 + struct slub_free_list *f;
21977 + if (!has_cpu_slab(cpu, s))
21980 + f = &per_cpu(slub_free_list, cpu);
21981 + raw_spin_lock_irq(&f->lock);
21982 + list_splice_init(&f->list, &tofree);
21983 + raw_spin_unlock_irq(&f->lock);
21984 + free_delayed(&tofree);
21989 @@ -2352,10 +2403,10 @@
21990 unsigned long x = 0;
21993 - spin_lock_irqsave(&n->list_lock, flags);
21994 + raw_spin_lock_irqsave(&n->list_lock, flags);
21995 list_for_each_entry(page, &n->partial, lru)
21996 x += get_count(page);
21997 - spin_unlock_irqrestore(&n->list_lock, flags);
21998 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22001 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
22002 @@ -2493,8 +2544,10 @@
22003 * already disabled (which is the case for bulk allocation).
22005 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22006 - unsigned long addr, struct kmem_cache_cpu *c)
22007 + unsigned long addr, struct kmem_cache_cpu *c,
22008 + struct list_head *to_free)
22010 + struct slub_free_list *f;
22014 @@ -2554,6 +2607,13 @@
22015 VM_BUG_ON(!c->page->frozen);
22016 c->freelist = get_freepointer(s, freelist);
22017 c->tid = next_tid(c->tid);
22020 + f = this_cpu_ptr(&slub_free_list);
22021 + raw_spin_lock(&f->lock);
22022 + list_splice_init(&f->list, to_free);
22023 + raw_spin_unlock(&f->lock);
22028 @@ -2585,7 +2645,7 @@
22029 deactivate_slab(s, page, get_freepointer(s, freelist));
22031 c->freelist = NULL;
22037 @@ -2597,6 +2657,7 @@
22040 unsigned long flags;
22041 + LIST_HEAD(tofree);
22043 local_irq_save(flags);
22044 #ifdef CONFIG_PREEMPT
22045 @@ -2608,8 +2669,9 @@
22046 c = this_cpu_ptr(s->cpu_slab);
22049 - p = ___slab_alloc(s, gfpflags, node, addr, c);
22050 + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
22051 local_irq_restore(flags);
22052 + free_delayed(&tofree);
22056 @@ -2795,7 +2857,7 @@
22060 - spin_unlock_irqrestore(&n->list_lock, flags);
22061 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22064 prior = page->freelist;
22065 @@ -2827,7 +2889,7 @@
22066 * Otherwise the list_lock will synchronize with
22067 * other processors updating the list of slabs.
22069 - spin_lock_irqsave(&n->list_lock, flags);
22070 + raw_spin_lock_irqsave(&n->list_lock, flags);
22074 @@ -2869,7 +2931,7 @@
22075 add_partial(n, page, DEACTIVATE_TO_TAIL);
22076 stat(s, FREE_ADD_PARTIAL);
22078 - spin_unlock_irqrestore(&n->list_lock, flags);
22079 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22083 @@ -2884,7 +2946,7 @@
22084 remove_full(s, n, page);
22087 - spin_unlock_irqrestore(&n->list_lock, flags);
22088 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22089 stat(s, FREE_SLAB);
22090 discard_slab(s, page);
22092 @@ -3089,6 +3151,7 @@
22095 struct kmem_cache_cpu *c;
22096 + LIST_HEAD(to_free);
22099 /* memcg and kmem_cache debug support */
22100 @@ -3112,7 +3175,7 @@
22101 * of re-populating per CPU c->freelist
22103 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
22105 + _RET_IP_, c, &to_free);
22106 if (unlikely(!p[i]))
22109 @@ -3124,6 +3187,7 @@
22111 c->tid = next_tid(c->tid);
22112 local_irq_enable();
22113 + free_delayed(&to_free);
22115 /* Clear memory outside IRQ disabled fastpath loop */
22116 if (unlikely(flags & __GFP_ZERO)) {
22117 @@ -3271,7 +3335,7 @@
22118 init_kmem_cache_node(struct kmem_cache_node *n)
22121 - spin_lock_init(&n->list_lock);
22122 + raw_spin_lock_init(&n->list_lock);
22123 INIT_LIST_HEAD(&n->partial);
22124 #ifdef CONFIG_SLUB_DEBUG
22125 atomic_long_set(&n->nr_slabs, 0);
22126 @@ -3615,6 +3679,10 @@
22129 #ifdef CONFIG_SLUB_DEBUG
22130 +#ifdef CONFIG_PREEMPT_RT_BASE
22131 + /* XXX move out of irq-off section */
22132 + slab_err(s, page, text, s->name);
22134 void *addr = page_address(page);
22136 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
22137 @@ -3635,6 +3703,7 @@
22145 @@ -3648,7 +3717,7 @@
22146 struct page *page, *h;
22148 BUG_ON(irqs_disabled());
22149 - spin_lock_irq(&n->list_lock);
22150 + raw_spin_lock_irq(&n->list_lock);
22151 list_for_each_entry_safe(page, h, &n->partial, lru) {
22152 if (!page->inuse) {
22153 remove_partial(n, page);
22154 @@ -3658,7 +3727,7 @@
22155 "Objects remaining in %s on __kmem_cache_shutdown()");
22158 - spin_unlock_irq(&n->list_lock);
22159 + raw_spin_unlock_irq(&n->list_lock);
22161 list_for_each_entry_safe(page, h, &discard, lru)
22162 discard_slab(s, page);
22163 @@ -3916,7 +3985,7 @@
22164 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
22165 INIT_LIST_HEAD(promote + i);
22167 - spin_lock_irqsave(&n->list_lock, flags);
22168 + raw_spin_lock_irqsave(&n->list_lock, flags);
22171 * Build lists of slabs to discard or promote.
22172 @@ -3947,7 +4016,7 @@
22173 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
22174 list_splice(promote + i, &n->partial);
22176 - spin_unlock_irqrestore(&n->list_lock, flags);
22177 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22179 /* Release empty slabs */
22180 list_for_each_entry_safe(page, t, &discard, lru)
22181 @@ -4123,6 +4192,12 @@
22183 static __initdata struct kmem_cache boot_kmem_cache,
22184 boot_kmem_cache_node;
22187 + for_each_possible_cpu(cpu) {
22188 + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
22189 + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
22192 if (debug_guardpage_minorder())
22193 slub_max_order = 0;
22194 @@ -4331,7 +4406,7 @@
22196 unsigned long flags;
22198 - spin_lock_irqsave(&n->list_lock, flags);
22199 + raw_spin_lock_irqsave(&n->list_lock, flags);
22201 list_for_each_entry(page, &n->partial, lru) {
22202 validate_slab_slab(s, page, map);
22203 @@ -4353,7 +4428,7 @@
22204 s->name, count, atomic_long_read(&n->nr_slabs));
22207 - spin_unlock_irqrestore(&n->list_lock, flags);
22208 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22212 @@ -4541,12 +4616,12 @@
22213 if (!atomic_long_read(&n->nr_slabs))
22216 - spin_lock_irqsave(&n->list_lock, flags);
22217 + raw_spin_lock_irqsave(&n->list_lock, flags);
22218 list_for_each_entry(page, &n->partial, lru)
22219 process_slab(&t, s, page, alloc, map);
22220 list_for_each_entry(page, &n->full, lru)
22221 process_slab(&t, s, page, alloc, map);
22222 - spin_unlock_irqrestore(&n->list_lock, flags);
22223 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22226 for (i = 0; i < t.count; i++) {
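A recurring piece of the slub.c changes above is the per-CPU slub_free_list: with the node list_lock now raw, __free_slab() (which can end up in page-allocator paths that take sleeping locks on RT) must not run while interrupts are off, so pages are parked on the per-CPU list and released later through free_delayed() once IRQs are enabled again. The shape of that pattern in plain, runnable C, with invented names:

    /* Sketch of the park-now/free-later pattern; ordinary malloc()/free()
     * stand in for slab pages, and the list link lives inside the object
     * itself, as the kernel uses page->lru. Build: gcc -o defer defer.c */
    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        struct obj *next;               /* link embedded in the object */
        char payload[56];
    };

    static struct obj *deferred_head;   /* stands in for slub_free_list */
    static int unsafe_context = 1;      /* stands in for irqs_disabled() */

    static void release(struct obj *o)
    {
        if (unsafe_context) {
            o->next = deferred_head;    /* park it; no allocation needed */
            deferred_head = o;
            return;
        }
        free(o);
    }

    static void free_delayed(void)
    {
        while (deferred_head) {
            struct obj *o = deferred_head;

            deferred_head = o->next;
            free(o);
        }
    }

    int main(void)
    {
        release(malloc(sizeof(struct obj)));
        release(malloc(sizeof(struct obj)));
        unsafe_context = 0;             /* back in a safe context... */
        free_delayed();                 /* ...flush what was parked */
        puts("deferred frees flushed");
        return 0;
    }

Because the link is embedded in the object, parking costs no allocation; the only requirement is that every path that re-enables interrupts also drains the list, which is why the hunks above splice and flush it in ___slab_alloc(), flush_all() and the bulk-allocation path alike.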
22227 diff -Nur linux-4.9.6.orig/mm/swap.c linux-4.9.6/mm/swap.c
22228 --- linux-4.9.6.orig/mm/swap.c 2017-01-26 08:25:24.000000000 +0100
22229 +++ linux-4.9.6/mm/swap.c 2017-01-28 13:59:10.103662798 +0100
22231 #include <linux/memcontrol.h>
22232 #include <linux/gfp.h>
22233 #include <linux/uio.h>
22234 +#include <linux/locallock.h>
22235 #include <linux/hugetlb.h>
22236 #include <linux/page_idle.h>
22240 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
22242 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
22243 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
22246 * This path almost never happens for VM activity - pages are normally
22247 @@ -240,11 +243,11 @@
22248 unsigned long flags;
22251 - local_irq_save(flags);
22252 + local_lock_irqsave(rotate_lock, flags);
22253 pvec = this_cpu_ptr(&lru_rotate_pvecs);
22254 if (!pagevec_add(pvec, page) || PageCompound(page))
22255 pagevec_move_tail(pvec);
22256 - local_irq_restore(flags);
22257 + local_unlock_irqrestore(rotate_lock, flags);
22261 @@ -294,12 +297,13 @@
22263 page = compound_head(page);
22264 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
22265 - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
22266 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22267 + activate_page_pvecs);
22270 if (!pagevec_add(pvec, page) || PageCompound(page))
22271 pagevec_lru_move_fn(pvec, __activate_page, NULL);
22272 - put_cpu_var(activate_page_pvecs);
22273 + put_locked_var(swapvec_lock, activate_page_pvecs);
22277 @@ -326,7 +330,7 @@
22279 static void __lru_cache_activate_page(struct page *page)
22281 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22282 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22286 @@ -348,7 +352,7 @@
22290 - put_cpu_var(lru_add_pvec);
22291 + put_locked_var(swapvec_lock, lru_add_pvec);
22295 @@ -390,12 +394,12 @@
22297 static void __lru_cache_add(struct page *page)
22299 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22300 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22303 if (!pagevec_add(pvec, page) || PageCompound(page))
22304 __pagevec_lru_add(pvec);
22305 - put_cpu_var(lru_add_pvec);
22306 + put_locked_var(swapvec_lock, lru_add_pvec);
22310 @@ -593,9 +597,15 @@
22311 unsigned long flags;
22313 /* No harm done if a racing interrupt already did this */
22314 - local_irq_save(flags);
22315 +#ifdef CONFIG_PREEMPT_RT_BASE
22316 + local_lock_irqsave_on(rotate_lock, flags, cpu);
22317 pagevec_move_tail(pvec);
22318 - local_irq_restore(flags);
22319 + local_unlock_irqrestore_on(rotate_lock, flags, cpu);
22321 + local_lock_irqsave(rotate_lock, flags);
22322 + pagevec_move_tail(pvec);
22323 + local_unlock_irqrestore(rotate_lock, flags);
22327 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
22328 @@ -627,11 +637,12 @@
22331 if (likely(get_page_unless_zero(page))) {
22332 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
22333 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22334 + lru_deactivate_file_pvecs);
22336 if (!pagevec_add(pvec, page) || PageCompound(page))
22337 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
22338 - put_cpu_var(lru_deactivate_file_pvecs);
22339 + put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
22343 @@ -646,27 +657,31 @@
22344 void deactivate_page(struct page *page)
22346 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
22347 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
22348 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22349 + lru_deactivate_pvecs);
22352 if (!pagevec_add(pvec, page) || PageCompound(page))
22353 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
22354 - put_cpu_var(lru_deactivate_pvecs);
22355 + put_locked_var(swapvec_lock, lru_deactivate_pvecs);
22359 void lru_add_drain(void)
22361 - lru_add_drain_cpu(get_cpu());
22363 + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
22364 + local_unlock_cpu(swapvec_lock);
22367 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
22368 +#ifdef CONFIG_PREEMPT_RT_BASE
22369 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22372 + local_lock_on(swapvec_lock, cpu);
22373 + lru_add_drain_cpu(cpu);
22374 + local_unlock_on(swapvec_lock, cpu);
22377 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22381 * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
22382 @@ -686,6 +701,22 @@
22384 early_initcall(lru_init);
22386 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
22391 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22392 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22394 + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22396 + INIT_WORK(work, lru_add_drain_per_cpu);
22397 + queue_work_on(cpu, lru_add_drain_wq, work);
22398 + cpumask_set_cpu(cpu, has_work);
22402 void lru_add_drain_all(void)
22404 static DEFINE_MUTEX(lock);
22405 @@ -697,21 +728,18 @@
22406 cpumask_clear(&has_work);
22408 for_each_online_cpu(cpu) {
22409 - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22411 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
22412 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
22413 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
22414 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
22415 - need_activate_page_drain(cpu)) {
22416 - INIT_WORK(work, lru_add_drain_per_cpu);
22417 - queue_work_on(cpu, lru_add_drain_wq, work);
22418 - cpumask_set_cpu(cpu, &has_work);
22420 + need_activate_page_drain(cpu))
22421 + remote_lru_add_drain(cpu, &has_work);
22424 +#ifndef CONFIG_PREEMPT_RT_BASE
22425 for_each_cpu(cpu, &has_work)
22426 flush_work(&per_cpu(lru_add_drain_work, cpu));
22430 mutex_unlock(&lock);
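The lru_add_drain_all() rework above shows what the per-CPU swapvec_lock buys: mainline must queue a work item on every CPU and flush it, because only code running on a CPU may touch that CPU's pagevecs, whereas the RT variant of remote_lru_add_drain() simply takes the remote CPU's lock and drains the pagevecs from the calling context. A toy version of the RT strategy (invented names; pthread mutexes stand in for local locks):

    /* Toy version of the RT drain strategy: with a lock per slot, one
     * caller can drain every slot directly.
     * Build: gcc -pthread -o drain drain.c */
    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    struct pagevec {
        pthread_mutex_t lock;           /* stands in for swapvec_lock */
        int nr;                         /* pages queued on this "CPU" */
    };

    static struct pagevec lru_add_pvec[NR_CPUS];

    /* stands in for the RT remote_lru_add_drain() above */
    static void remote_drain(int cpu)
    {
        pthread_mutex_lock(&lru_add_pvec[cpu].lock);
        lru_add_pvec[cpu].nr = 0;   /* the real code moves pages to the LRU */
        pthread_mutex_unlock(&lru_add_pvec[cpu].lock);
    }

    int main(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            pthread_mutex_init(&lru_add_pvec[cpu].lock, NULL);
            lru_add_pvec[cpu].nr = cpu; /* pretend pages are queued */
        }
        /* no per-CPU work items, no flush_work(): just take each lock */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            remote_drain(cpu);
        puts("all pagevecs drained from one thread");
        return 0;
    }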
22431 diff -Nur linux-4.9.6.orig/mm/truncate.c linux-4.9.6/mm/truncate.c
22432 --- linux-4.9.6.orig/mm/truncate.c 2017-01-26 08:25:24.000000000 +0100
22433 +++ linux-4.9.6/mm/truncate.c 2017-01-28 13:59:10.103662798 +0100
22435 * protected by mapping->tree_lock.
22437 if (!workingset_node_shadows(node) &&
22438 - !list_empty(&node->private_list))
22439 - list_lru_del(&workingset_shadow_nodes,
22440 + !list_empty(&node->private_list)) {
22441 + local_lock(workingset_shadow_lock);
22442 + list_lru_del(&__workingset_shadow_nodes,
22443 &node->private_list);
22444 + local_unlock(workingset_shadow_lock);
22446 __radix_tree_delete_node(&mapping->page_tree, node);
22448 spin_unlock_irq(&mapping->tree_lock);
22449 diff -Nur linux-4.9.6.orig/mm/vmalloc.c linux-4.9.6/mm/vmalloc.c
22450 --- linux-4.9.6.orig/mm/vmalloc.c 2017-01-26 08:25:24.000000000 +0100
22451 +++ linux-4.9.6/mm/vmalloc.c 2017-01-28 13:59:10.103662798 +0100
22452 @@ -845,7 +845,7 @@
22453 struct vmap_block *vb;
22454 struct vmap_area *va;
22455 unsigned long vb_idx;
22457 + int node, err, cpu;
22460 node = numa_node_id();
22461 @@ -888,11 +888,12 @@
22463 radix_tree_preload_end();
22465 - vbq = &get_cpu_var(vmap_block_queue);
22466 + cpu = get_cpu_light();
22467 + vbq = this_cpu_ptr(&vmap_block_queue);
22468 spin_lock(&vbq->lock);
22469 list_add_tail_rcu(&vb->free_list, &vbq->free);
22470 spin_unlock(&vbq->lock);
22471 - put_cpu_var(vmap_block_queue);
22476 @@ -961,6 +962,7 @@
22477 struct vmap_block *vb;
22478 void *vaddr = NULL;
22479 unsigned int order;
22482 BUG_ON(offset_in_page(size));
22483 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
22484 @@ -975,7 +977,8 @@
22485 order = get_order(size);
22488 - vbq = &get_cpu_var(vmap_block_queue);
22489 + cpu = get_cpu_light();
22490 + vbq = this_cpu_ptr(&vmap_block_queue);
22491 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
22492 unsigned long pages_off;
22494 @@ -998,7 +1001,7 @@
22498 - put_cpu_var(vmap_block_queue);
22502 /* Allocate new block if nothing was found */
22503 diff -Nur linux-4.9.6.orig/mm/vmstat.c linux-4.9.6/mm/vmstat.c
22504 --- linux-4.9.6.orig/mm/vmstat.c 2017-01-26 08:25:24.000000000 +0100
22505 +++ linux-4.9.6/mm/vmstat.c 2017-01-28 13:59:10.103662798 +0100
22506 @@ -245,6 +245,7 @@
22510 + preempt_disable_rt();
22511 x = delta + __this_cpu_read(*p);
22513 t = __this_cpu_read(pcp->stat_threshold);
22514 @@ -254,6 +255,7 @@
22517 __this_cpu_write(*p, x);
22518 + preempt_enable_rt();
22520 EXPORT_SYMBOL(__mod_zone_page_state);
22522 @@ -265,6 +267,7 @@
22526 + preempt_disable_rt();
22527 x = delta + __this_cpu_read(*p);
22529 t = __this_cpu_read(pcp->stat_threshold);
22530 @@ -274,6 +277,7 @@
22533 __this_cpu_write(*p, x);
22534 + preempt_enable_rt();
22536 EXPORT_SYMBOL(__mod_node_page_state);
22538 @@ -306,6 +310,7 @@
22539 s8 __percpu *p = pcp->vm_stat_diff + item;
22542 + preempt_disable_rt();
22543 v = __this_cpu_inc_return(*p);
22544 t = __this_cpu_read(pcp->stat_threshold);
22545 if (unlikely(v > t)) {
22546 @@ -314,6 +319,7 @@
22547 zone_page_state_add(v + overstep, zone, item);
22548 __this_cpu_write(*p, -overstep);
22550 + preempt_enable_rt();
22553 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22554 @@ -322,6 +328,7 @@
22555 s8 __percpu *p = pcp->vm_node_stat_diff + item;
22558 + preempt_disable_rt();
22559 v = __this_cpu_inc_return(*p);
22560 t = __this_cpu_read(pcp->stat_threshold);
22561 if (unlikely(v > t)) {
22562 @@ -330,6 +337,7 @@
22563 node_page_state_add(v + overstep, pgdat, item);
22564 __this_cpu_write(*p, -overstep);
22566 + preempt_enable_rt();
22569 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
22570 @@ -350,6 +358,7 @@
22571 s8 __percpu *p = pcp->vm_stat_diff + item;
22574 + preempt_disable_rt();
22575 v = __this_cpu_dec_return(*p);
22576 t = __this_cpu_read(pcp->stat_threshold);
22577 if (unlikely(v < - t)) {
22578 @@ -358,6 +367,7 @@
22579 zone_page_state_add(v - overstep, zone, item);
22580 __this_cpu_write(*p, overstep);
22582 + preempt_enable_rt();
22585 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22586 @@ -366,6 +376,7 @@
22587 s8 __percpu *p = pcp->vm_node_stat_diff + item;
22590 + preempt_disable_rt();
22591 v = __this_cpu_dec_return(*p);
22592 t = __this_cpu_read(pcp->stat_threshold);
22593 if (unlikely(v < - t)) {
22594 @@ -374,6 +385,7 @@
22595 node_page_state_add(v - overstep, pgdat, item);
22596 __this_cpu_write(*p, overstep);
22598 + preempt_enable_rt();
22601 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
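The vmstat hunks bracket each counter update with preempt_disable_rt()/preempt_enable_rt(). These are no-ops on mainline, where the callers already run with preemption or interrupts off; on RT the callers may be preemptible, and the __this_cpu read-modify-write sequence then loses updates when two contexts interleave on the same counter. The lost-update mechanism itself is plain C, reproduced here with two racing threads:

    /* Deliberately racy demo: two threads each do one million unprotected
     * read-modify-write updates; the total usually comes up short.
     * Build: gcc -pthread -O2 -o race race.c */
    #include <pthread.h>
    #include <stdio.h>

    static volatile long counter;

    static void *bump(void *arg)
    {
        (void)arg;
        for (int i = 0; i < 1000000; i++)
            counter = counter + 1;      /* load, add, store: not atomic */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, bump, NULL);
        pthread_create(&b, NULL, bump, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("expected 2000000, got %ld\n", counter);
        return 0;
    }

The volatile is only there to force a real load and store per iteration; the point is the unprotected read-modify-write, which is exactly the window the preempt_disable_rt() pair closes for the vmstat counters.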
22602 diff -Nur linux-4.9.6.orig/mm/workingset.c linux-4.9.6/mm/workingset.c
22603 --- linux-4.9.6.orig/mm/workingset.c 2017-01-26 08:25:24.000000000 +0100
22604 +++ linux-4.9.6/mm/workingset.c 2017-01-28 13:59:10.103662798 +0100
22605 @@ -334,7 +334,8 @@
22606 * point where they would still be useful.
22609 -struct list_lru workingset_shadow_nodes;
22610 +struct list_lru __workingset_shadow_nodes;
22611 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
22613 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
22614 struct shrink_control *sc)
22615 @@ -344,9 +345,9 @@
22616 unsigned long pages;
22618 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
22619 - local_irq_disable();
22620 - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
22621 - local_irq_enable();
22622 + local_lock_irq(workingset_shadow_lock);
22623 + shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
22624 + local_unlock_irq(workingset_shadow_lock);
22627 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
22628 @@ -438,9 +439,9 @@
22629 spin_unlock(&mapping->tree_lock);
22630 ret = LRU_REMOVED_RETRY;
22632 - local_irq_enable();
22633 + local_unlock_irq(workingset_shadow_lock);
22635 - local_irq_disable();
22636 + local_lock_irq(workingset_shadow_lock);
22637 spin_lock(lru_lock);
22640 @@ -451,10 +452,10 @@
22643 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
22644 - local_irq_disable();
22645 - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
22646 + local_lock_irq(workingset_shadow_lock);
22647 + ret = list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
22648 shadow_lru_isolate, NULL);
22649 - local_irq_enable();
22650 + local_unlock_irq(workingset_shadow_lock);
22654 @@ -492,7 +493,7 @@
22655 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
22656 timestamp_bits, max_order, bucket_order);
22658 - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
22659 + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
22662 ret = register_shrinker(&workingset_shadow_shrinker);
22663 @@ -500,7 +501,7 @@
22667 - list_lru_destroy(&workingset_shadow_nodes);
22668 + list_lru_destroy(&__workingset_shadow_nodes);
22672 diff -Nur linux-4.9.6.orig/mm/zsmalloc.c linux-4.9.6/mm/zsmalloc.c
22673 --- linux-4.9.6.orig/mm/zsmalloc.c 2017-01-26 08:25:24.000000000 +0100
22674 +++ linux-4.9.6/mm/zsmalloc.c 2017-01-28 13:59:10.103662798 +0100
22676 #include <linux/mount.h>
22677 #include <linux/migrate.h>
22678 #include <linux/pagemap.h>
22679 +#include <linux/locallock.h>
22681 #define ZSPAGE_MAGIC 0x58
22685 #define ZS_MAX_ZSPAGE_ORDER 2
22686 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
22688 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
22690 +#ifdef CONFIG_PREEMPT_RT_FULL
22692 +struct zsmalloc_handle {
22693 + unsigned long addr;
22694 + struct mutex lock;
22697 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
22701 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
22705 * Object location (<PFN>, <obj_idx>) is encoded as
22706 * a single (unsigned long) handle value.
22707 @@ -327,7 +341,7 @@
22709 static int create_cache(struct zs_pool *pool)
22711 - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
22712 + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
22714 if (!pool->handle_cachep)
22716 @@ -351,10 +365,27 @@
22718 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
22720 - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
22721 - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
22724 + p = kmem_cache_alloc(pool->handle_cachep,
22725 + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
22726 +#ifdef CONFIG_PREEMPT_RT_FULL
22728 + struct zsmalloc_handle *zh = p;
22730 + mutex_init(&zh->lock);
22733 + return (unsigned long)p;
22736 +#ifdef CONFIG_PREEMPT_RT_FULL
22737 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
22739 + return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1));
22743 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
22745 kmem_cache_free(pool->handle_cachep, (void *)handle);
22746 @@ -373,12 +404,18 @@
22748 static void record_obj(unsigned long handle, unsigned long obj)
22750 +#ifdef CONFIG_PREEMPT_RT_FULL
22751 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22753 + WRITE_ONCE(zh->addr, obj);
22756 * lsb of @obj represents handle lock while other bits
22757 * represent object value the handle is pointing to, so
22758 * updating shouldn't do store tearing.
22760 WRITE_ONCE(*(unsigned long *)handle, obj);
22765 @@ -467,6 +504,7 @@
22767 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
22768 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
22769 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
22771 static bool is_zspage_isolated(struct zspage *zspage)
22773 @@ -902,7 +940,13 @@
22775 static unsigned long handle_to_obj(unsigned long handle)
22777 +#ifdef CONFIG_PREEMPT_RT_FULL
22778 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22782 return *(unsigned long *)handle;
22786 static unsigned long obj_to_head(struct page *page, void *obj)
22787 @@ -916,22 +960,46 @@
22789 static inline int testpin_tag(unsigned long handle)
22791 +#ifdef CONFIG_PREEMPT_RT_FULL
22792 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22794 + return mutex_is_locked(&zh->lock);
22796 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
22800 static inline int trypin_tag(unsigned long handle)
22802 +#ifdef CONFIG_PREEMPT_RT_FULL
22803 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22805 + return mutex_trylock(&zh->lock);
22807 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
22811 static void pin_tag(unsigned long handle)
22813 +#ifdef CONFIG_PREEMPT_RT_FULL
22814 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22816 + return mutex_lock(&zh->lock);
22818 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
22822 static void unpin_tag(unsigned long handle)
22824 +#ifdef CONFIG_PREEMPT_RT_FULL
22825 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22827 + return mutex_unlock(&zh->lock);
22829 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
22833 static void reset_page(struct page *page)
22834 @@ -1423,7 +1491,7 @@
22835 class = pool->size_class[class_idx];
22836 off = (class->size * obj_idx) & ~PAGE_MASK;
22838 - area = &get_cpu_var(zs_map_area);
22839 + area = &get_locked_var(zs_map_area_lock, zs_map_area);
22841 if (off + class->size <= PAGE_SIZE) {
22842 /* this object is contained entirely within a page */
22843 @@ -1477,7 +1545,7 @@
22845 __zs_unmap_object(area, pages, off, class->size);
22847 - put_cpu_var(zs_map_area);
22848 + put_locked_var(zs_map_area_lock, zs_map_area);
22850 migrate_read_unlock(zspage);
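zsmalloc normally pins a handle with bit_spin_lock() on HANDLE_PIN_BIT, the low bit of the handle word. A bit spinlock has no owner field and no way to sleep, so on RT, where the lock holder itself may be preempted, waiters can spin indefinitely with no priority inheritance to help them; the hunks above therefore grow the handle into a struct carrying a real mutex. For reference, a toy bit spinlock in C11 atomics (not the kernel implementation) shows how little state there is to work with:

    /* Toy bit spinlock on bit 0 of a word, in the spirit of
     * HANDLE_PIN_BIT. Build: gcc -o bitlock bitlock.c */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static void bit_lock(_Atomic uintptr_t *word)
    {
        /* set bit 0; if it was already set, someone holds the lock */
        while (atomic_fetch_or(word, 1) & 1)
            ;   /* spin: a preempted owner leaves every waiter burning CPU */
    }

    static void bit_unlock(_Atomic uintptr_t *word)
    {
        atomic_fetch_and(word, ~(uintptr_t)1);
    }

    int main(void)
    {
        _Atomic uintptr_t handle = 0x1000;  /* an aligned "pointer" */

        bit_lock(&handle);
        printf("pinned:   %#lx\n", (unsigned long)handle);
        bit_unlock(&handle);
        printf("unpinned: %#lx\n", (unsigned long)handle);
        return 0;
    }

With only one bit of lock state there is nothing to attach an owner or a waiter list to, which is why the RT fix changes the data structure (handle becomes struct zsmalloc_handle) rather than just the lock calls.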
22852 diff -Nur linux-4.9.6.orig/net/core/dev.c linux-4.9.6/net/core/dev.c
22853 --- linux-4.9.6.orig/net/core/dev.c 2017-01-26 08:25:24.000000000 +0100
22854 +++ linux-4.9.6/net/core/dev.c 2017-01-28 13:59:10.103662798 +0100
22855 @@ -190,6 +190,7 @@
22856 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
22858 static seqcount_t devnet_rename_seq;
22859 +static DEFINE_MUTEX(devnet_rename_mutex);
22861 static inline void dev_base_seq_inc(struct net *net)
22863 @@ -211,14 +212,14 @@
22864 static inline void rps_lock(struct softnet_data *sd)
22867 - spin_lock(&sd->input_pkt_queue.lock);
22868 + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
22872 static inline void rps_unlock(struct softnet_data *sd)
22875 - spin_unlock(&sd->input_pkt_queue.lock);
22876 + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
22880 @@ -888,7 +889,8 @@
22881 strcpy(name, dev->name);
22883 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
22885 + mutex_lock(&devnet_rename_mutex);
22886 + mutex_unlock(&devnet_rename_mutex);
22890 @@ -1157,20 +1159,17 @@
22891 if (dev->flags & IFF_UP)
22894 - write_seqcount_begin(&devnet_rename_seq);
22895 + mutex_lock(&devnet_rename_mutex);
22896 + __raw_write_seqcount_begin(&devnet_rename_seq);
22898 - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
22899 - write_seqcount_end(&devnet_rename_seq);
22902 + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
22905 memcpy(oldname, dev->name, IFNAMSIZ);
22907 err = dev_get_valid_name(net, dev, newname);
22909 - write_seqcount_end(&devnet_rename_seq);
22915 if (oldname[0] && !strchr(oldname, '%'))
22916 netdev_info(dev, "renamed from %s\n", oldname);
22917 @@ -1183,11 +1182,12 @@
22919 memcpy(dev->name, oldname, IFNAMSIZ);
22920 dev->name_assign_type = old_assign_type;
22921 - write_seqcount_end(&devnet_rename_seq);
22927 - write_seqcount_end(&devnet_rename_seq);
22928 + __raw_write_seqcount_end(&devnet_rename_seq);
22929 + mutex_unlock(&devnet_rename_mutex);
22931 netdev_adjacent_rename_links(dev, oldname);
22933 @@ -1208,7 +1208,8 @@
22934 /* err >= 0 after dev_alloc_name() or stores the first errno */
22937 - write_seqcount_begin(&devnet_rename_seq);
22938 + mutex_lock(&devnet_rename_mutex);
22939 + __raw_write_seqcount_begin(&devnet_rename_seq);
22940 memcpy(dev->name, oldname, IFNAMSIZ);
22941 memcpy(oldname, newname, IFNAMSIZ);
22942 dev->name_assign_type = old_assign_type;
22943 @@ -1221,6 +1222,11 @@
22949 + __raw_write_seqcount_end(&devnet_rename_seq);
22950 + mutex_unlock(&devnet_rename_mutex);
22955 @@ -2263,6 +2269,7 @@
22956 sd->output_queue_tailp = &q->next_sched;
22957 raise_softirq_irqoff(NET_TX_SOFTIRQ);
22958 local_irq_restore(flags);
22959 + preempt_check_resched_rt();
22962 void __netif_schedule(struct Qdisc *q)
22963 @@ -2344,6 +2351,7 @@
22964 __this_cpu_write(softnet_data.completion_queue, skb);
22965 raise_softirq_irqoff(NET_TX_SOFTIRQ);
22966 local_irq_restore(flags);
22967 + preempt_check_resched_rt();
22969 EXPORT_SYMBOL(__dev_kfree_skb_irq);
22971 @@ -3078,7 +3086,11 @@
22972 * This permits qdisc->running owner to get the lock more
22973 * often and dequeue packets faster.
22975 +#ifdef CONFIG_PREEMPT_RT_FULL
22976 + contended = true;
22978 contended = qdisc_is_running(q);
22980 if (unlikely(contended))
22981 spin_lock(&q->busylock);
22983 @@ -3141,8 +3153,10 @@
22984 #define skb_update_prio(skb)
22987 +#ifndef CONFIG_PREEMPT_RT_FULL
22988 DEFINE_PER_CPU(int, xmit_recursion);
22989 EXPORT_SYMBOL(xmit_recursion);
22993 * dev_loopback_xmit - loop back @skb
22994 @@ -3376,8 +3390,7 @@
22995 int cpu = smp_processor_id(); /* ok because BHs are off */
22997 if (txq->xmit_lock_owner != cpu) {
22998 - if (unlikely(__this_cpu_read(xmit_recursion) >
22999 - XMIT_RECURSION_LIMIT))
23000 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
23001 goto recursion_alert;
23003 skb = validate_xmit_skb(skb, dev);
23004 @@ -3387,9 +3400,9 @@
23005 HARD_TX_LOCK(dev, txq, cpu);
23007 if (!netif_xmit_stopped(txq)) {
23008 - __this_cpu_inc(xmit_recursion);
23010 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
23011 - __this_cpu_dec(xmit_recursion);
23013 if (dev_xmit_complete(rc)) {
23014 HARD_TX_UNLOCK(dev, txq);
23016 @@ -3763,6 +3776,7 @@
23019 local_irq_restore(flags);
23020 + preempt_check_resched_rt();
23022 atomic_long_inc(&skb->dev->rx_dropped);
23024 @@ -3781,7 +3795,7 @@
23025 struct rps_dev_flow voidflow, *rflow = &voidflow;
23028 - preempt_disable();
23029 + migrate_disable();
23032 cpu = get_rps_cpu(skb->dev, skb, &rflow);
23033 @@ -3791,13 +3805,13 @@
23034 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
23037 - preempt_enable();
23038 + migrate_enable();
23042 unsigned int qtail;
23043 - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
23045 + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
23050 @@ -3831,11 +3845,9 @@
23052 trace_netif_rx_ni_entry(skb);
23054 - preempt_disable();
23055 + local_bh_disable();
23056 err = netif_rx_internal(skb);
23057 - if (local_softirq_pending())
23059 - preempt_enable();
23060 + local_bh_enable();
23064 @@ -4314,7 +4326,7 @@
23065 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
23066 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23067 __skb_unlink(skb, &sd->input_pkt_queue);
23069 + __skb_queue_tail(&sd->tofree_queue, skb);
23070 input_queue_head_incr(sd);
23073 @@ -4324,11 +4336,14 @@
23074 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
23075 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23076 __skb_unlink(skb, &sd->process_queue);
23078 + __skb_queue_tail(&sd->tofree_queue, skb);
23079 input_queue_head_incr(sd);
23082 + if (!skb_queue_empty(&sd->tofree_queue))
23083 + raise_softirq_irqoff(NET_RX_SOFTIRQ);
23088 static void flush_all_backlogs(void)
23089 @@ -4809,6 +4824,7 @@
23090 sd->rps_ipi_list = NULL;
23092 local_irq_enable();
23093 + preempt_check_resched_rt();
23095 /* Send pending IPI's to kick RPS processing on remote cpus. */
23097 @@ -4822,6 +4838,7 @@
23100 local_irq_enable();
23101 + preempt_check_resched_rt();
23104 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
23105 @@ -4851,7 +4868,9 @@
23107 struct sk_buff *skb;
23109 + local_irq_disable();
23110 while ((skb = __skb_dequeue(&sd->process_queue))) {
23111 + local_irq_enable();
23113 __netif_receive_skb(skb);
23115 @@ -4859,9 +4878,9 @@
23116 if (++work >= quota)
23119 + local_irq_disable();
23122 - local_irq_disable();
23124 if (skb_queue_empty(&sd->input_pkt_queue)) {
23126 @@ -4899,9 +4918,11 @@
23127 local_irq_save(flags);
23128 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23129 local_irq_restore(flags);
23130 + preempt_check_resched_rt();
23132 EXPORT_SYMBOL(__napi_schedule);
23134 +#ifndef CONFIG_PREEMPT_RT_FULL
23136 * __napi_schedule_irqoff - schedule for receive
23137 * @n: entry to schedule
23138 @@ -4913,6 +4934,7 @@
23139 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23141 EXPORT_SYMBOL(__napi_schedule_irqoff);
23144 void __napi_complete(struct napi_struct *n)
23146 @@ -5202,13 +5224,21 @@
23147 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
23148 unsigned long time_limit = jiffies + 2;
23149 int budget = netdev_budget;
23150 + struct sk_buff_head tofree_q;
23151 + struct sk_buff *skb;
23155 + __skb_queue_head_init(&tofree_q);
23157 local_irq_disable();
23158 + skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
23159 list_splice_init(&sd->poll_list, &list);
23160 local_irq_enable();
23162 + while ((skb = __skb_dequeue(&tofree_q)))
23166 struct napi_struct *n;
23168 @@ -5239,7 +5269,7 @@
23169 list_splice_tail(&repoll, &list);
23170 list_splice(&list, &sd->poll_list);
23171 if (!list_empty(&sd->poll_list))
23172 - __raise_softirq_irqoff(NET_RX_SOFTIRQ);
23173 + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
23175 net_rps_action_and_irq_enable(sd);
23177 @@ -8000,16 +8030,20 @@
23179 raise_softirq_irqoff(NET_TX_SOFTIRQ);
23180 local_irq_enable();
23181 + preempt_check_resched_rt();
23183 /* Process offline CPU's input_pkt_queue */
23184 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
23186 input_queue_head_incr(oldsd);
23188 - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
23189 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
23191 input_queue_head_incr(oldsd);
23193 + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
23199 @@ -8314,8 +8348,9 @@
23201 INIT_WORK(flush, flush_backlog);
23203 - skb_queue_head_init(&sd->input_pkt_queue);
23204 - skb_queue_head_init(&sd->process_queue);
23205 + skb_queue_head_init_raw(&sd->input_pkt_queue);
23206 + skb_queue_head_init_raw(&sd->process_queue);
23207 + skb_queue_head_init_raw(&sd->tofree_queue);
23208 INIT_LIST_HEAD(&sd->poll_list);
23209 sd->output_queue_tailp = &sd->output_queue;
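The xmit_recursion changes here and in net/core/filter.c below replace the per-CPU recursion counter with xmit_rec_read() and matching increment/decrement wrappers (their definitions fall outside the lines shown). On RT the transmit path is preemptible and the task may migrate between CPUs mid-transmit, so a depth counter tied to the CPU can be read or modified on the wrong CPU; the depth has to travel with the task instead. In userspace, thread-local storage expresses the same idea:

    /* Recursion depth kept per task (thread-local) rather than per CPU;
     * names are illustrative. Build: gcc -o recur recur.c */
    #include <stdio.h>

    #define XMIT_RECURSION_LIMIT 10

    static _Thread_local int xmit_recursion;

    static int xmit_demo(int depth)
    {
        int ret;

        if (xmit_recursion > XMIT_RECURSION_LIMIT) {
            fprintf(stderr, "recursion limit hit at depth %d\n", depth);
            return -1;
        }
        xmit_recursion++;
        /* a misconfigured stack re-enters the xmit path, e.g. a device
         * looped back onto itself; recurse to model that */
        ret = xmit_demo(depth + 1);
        xmit_recursion--;
        return ret;
    }

    int main(void)
    {
        xmit_demo(0);
        return 0;
    }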
23211 diff -Nur linux-4.9.6.orig/net/core/filter.c linux-4.9.6/net/core/filter.c
23212 --- linux-4.9.6.orig/net/core/filter.c 2017-01-26 08:25:24.000000000 +0100
23213 +++ linux-4.9.6/net/core/filter.c 2017-01-28 13:59:10.103662798 +0100
23214 @@ -1645,7 +1645,7 @@
23218 - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
23219 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
23220 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
23223 @@ -1653,9 +1653,9 @@
23227 - __this_cpu_inc(xmit_recursion);
23229 ret = dev_queue_xmit(skb);
23230 - __this_cpu_dec(xmit_recursion);
23235 diff -Nur linux-4.9.6.orig/net/core/gen_estimator.c linux-4.9.6/net/core/gen_estimator.c
23236 --- linux-4.9.6.orig/net/core/gen_estimator.c 2017-01-26 08:25:24.000000000 +0100
23237 +++ linux-4.9.6/net/core/gen_estimator.c 2017-01-28 13:59:10.103662798 +0100
23239 struct gnet_stats_basic_packed *bstats;
23240 struct gnet_stats_rate_est64 *rate_est;
23241 spinlock_t *stats_lock;
23242 - seqcount_t *running;
23243 + net_seqlock_t *running;
23246 unsigned long avpps;
23247 @@ -213,7 +213,7 @@
23248 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23249 struct gnet_stats_rate_est64 *rate_est,
23250 spinlock_t *stats_lock,
23251 - seqcount_t *running,
23252 + net_seqlock_t *running,
23253 struct nlattr *opt)
23255 struct gen_estimator *est;
23256 @@ -309,7 +309,7 @@
23257 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23258 struct gnet_stats_rate_est64 *rate_est,
23259 spinlock_t *stats_lock,
23260 - seqcount_t *running, struct nlattr *opt)
23261 + net_seqlock_t *running, struct nlattr *opt)
23263 gen_kill_estimator(bstats, rate_est);
23264 return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
23265 diff -Nur linux-4.9.6.orig/net/core/gen_stats.c linux-4.9.6/net/core/gen_stats.c
23266 --- linux-4.9.6.orig/net/core/gen_stats.c 2017-01-26 08:25:24.000000000 +0100
23267 +++ linux-4.9.6/net/core/gen_stats.c 2017-01-28 13:59:10.107662951 +0100
23268 @@ -130,7 +130,7 @@
23272 -__gnet_stats_copy_basic(const seqcount_t *running,
23273 +__gnet_stats_copy_basic(net_seqlock_t *running,
23274 struct gnet_stats_basic_packed *bstats,
23275 struct gnet_stats_basic_cpu __percpu *cpu,
23276 struct gnet_stats_basic_packed *b)
23277 @@ -143,10 +143,10 @@
23281 - seq = read_seqcount_begin(running);
23282 + seq = net_seq_begin(running);
23283 bstats->bytes = b->bytes;
23284 bstats->packets = b->packets;
23285 - } while (running && read_seqcount_retry(running, seq));
23286 + } while (running && net_seq_retry(running, seq));
23288 EXPORT_SYMBOL(__gnet_stats_copy_basic);
23290 @@ -164,7 +164,7 @@
23291 * if the room in the socket buffer was not sufficient.
23294 -gnet_stats_copy_basic(const seqcount_t *running,
23295 +gnet_stats_copy_basic(net_seqlock_t *running,
23296 struct gnet_dump *d,
23297 struct gnet_stats_basic_cpu __percpu *cpu,
23298 struct gnet_stats_basic_packed *b)
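The estimator and stats changes above, together with the sch_api.c and sch_generic.c hunks further down, replace the qdisc's seqcount_t with net_seqlock_t. A bare seqcount assumes the writer cannot be preempted mid-update, since readers simply retry while the sequence is odd; on RT the writer can be preempted, so the count is wrapped in a seqlock whose write side is serialized by a proper lock that RT can apply priority inheritance to. The reader/writer protocol itself, in deliberately simplified C11 (not the kernel's implementation):

    /* Simplified seqcount: the writer bumps the sequence around its
     * update, the reader retries on an odd or changed sequence.
     * Build: gcc -pthread -o seq seq.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned int seq;
    static unsigned long bytes, packets;
    static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

    static void stats_update(unsigned long b)
    {
        pthread_mutex_lock(&write_lock); /* the "lock" half of net_seqlock_t */
        atomic_fetch_add(&seq, 1);       /* odd: update in progress */
        bytes += b;
        packets += 1;
        atomic_fetch_add(&seq, 1);       /* even again: consistent */
        pthread_mutex_unlock(&write_lock);
    }

    static void stats_read(unsigned long *b, unsigned long *p)
    {
        for (;;) {
            unsigned int s = atomic_load(&seq);

            if (s & 1)
                continue;               /* writer active, retry */
            *b = bytes;
            *p = packets;
            if (atomic_load(&seq) == s)
                return;                 /* no writer raced us */
        }
    }

    int main(void)
    {
        unsigned long b, p;

        stats_update(1500);
        stats_update(60);
        stats_read(&b, &p);
        printf("%lu bytes / %lu packets\n", b, p);
        return 0;
    }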
23299 diff -Nur linux-4.9.6.orig/net/core/skbuff.c linux-4.9.6/net/core/skbuff.c
23300 --- linux-4.9.6.orig/net/core/skbuff.c 2017-01-26 08:25:24.000000000 +0100
23301 +++ linux-4.9.6/net/core/skbuff.c 2017-01-28 13:59:10.107662951 +0100
23303 #include <linux/errqueue.h>
23304 #include <linux/prefetch.h>
23305 #include <linux/if_vlan.h>
23306 +#include <linux/locallock.h>
23308 #include <net/protocol.h>
23309 #include <net/dst.h>
23310 @@ -360,6 +361,8 @@
23312 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
23313 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
23314 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
23315 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
23317 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23319 @@ -367,10 +370,10 @@
23320 unsigned long flags;
23323 - local_irq_save(flags);
23324 + local_lock_irqsave(netdev_alloc_lock, flags);
23325 nc = this_cpu_ptr(&netdev_alloc_cache);
23326 data = __alloc_page_frag(nc, fragsz, gfp_mask);
23327 - local_irq_restore(flags);
23328 + local_unlock_irqrestore(netdev_alloc_lock, flags);
23332 @@ -389,9 +392,13 @@
23334 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23336 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23337 + struct napi_alloc_cache *nc;
23340 - return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23341 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23342 + data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23343 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23347 void *napi_alloc_frag(unsigned int fragsz)
23348 @@ -438,13 +445,13 @@
23349 if (sk_memalloc_socks())
23350 gfp_mask |= __GFP_MEMALLOC;
23352 - local_irq_save(flags);
23353 + local_lock_irqsave(netdev_alloc_lock, flags);
23355 nc = this_cpu_ptr(&netdev_alloc_cache);
23356 data = __alloc_page_frag(nc, len, gfp_mask);
23357 pfmemalloc = nc->pfmemalloc;
23359 - local_irq_restore(flags);
23360 + local_unlock_irqrestore(netdev_alloc_lock, flags);
23362 if (unlikely(!data))
23364 @@ -485,9 +492,10 @@
23365 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
23368 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23369 + struct napi_alloc_cache *nc;
23370 struct sk_buff *skb;
23374 len += NET_SKB_PAD + NET_IP_ALIGN;
23376 @@ -505,7 +513,10 @@
23377 if (sk_memalloc_socks())
23378 gfp_mask |= __GFP_MEMALLOC;
23380 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23381 data = __alloc_page_frag(&nc->page, len, gfp_mask);
23382 + pfmemalloc = nc->page.pfmemalloc;
23383 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23384 if (unlikely(!data))
23387 @@ -516,7 +527,7 @@
23390 /* use OR instead of assignment to avoid clearing of bits in mask */
23391 - if (nc->page.pfmemalloc)
23393 skb->pfmemalloc = 1;
23394 skb->head_frag = 1;
23396 @@ -760,23 +771,26 @@
23398 void __kfree_skb_flush(void)
23400 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23401 + struct napi_alloc_cache *nc;
23403 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23404 /* flush skb_cache if containing objects */
23405 if (nc->skb_count) {
23406 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
23410 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23413 static inline void _kfree_skb_defer(struct sk_buff *skb)
23415 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23416 + struct napi_alloc_cache *nc;
23418 /* drop skb->head and call any destructors for packet */
23419 skb_release_all(skb);
23421 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23422 /* record skb to CPU local list */
23423 nc->skb_cache[nc->skb_count++] = skb;
23425 @@ -791,6 +805,7 @@
23429 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23431 void __kfree_skb_defer(struct sk_buff *skb)
23433 diff -Nur linux-4.9.6.orig/net/core/sock.c linux-4.9.6/net/core/sock.c
23434 --- linux-4.9.6.orig/net/core/sock.c 2017-01-26 08:25:24.000000000 +0100
23435 +++ linux-4.9.6/net/core/sock.c 2017-01-28 13:59:10.107662951 +0100
23436 @@ -2488,12 +2488,11 @@
23437 if (sk->sk_lock.owned)
23439 sk->sk_lock.owned = 1;
23440 - spin_unlock(&sk->sk_lock.slock);
23441 + spin_unlock_bh(&sk->sk_lock.slock);
23443 * The sk_lock has mutex_lock() semantics here:
23445 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
23446 - local_bh_enable();
23448 EXPORT_SYMBOL(lock_sock_nested);
23450 diff -Nur linux-4.9.6.orig/net/ipv4/icmp.c linux-4.9.6/net/ipv4/icmp.c
23451 --- linux-4.9.6.orig/net/ipv4/icmp.c 2017-01-26 08:25:24.000000000 +0100
23452 +++ linux-4.9.6/net/ipv4/icmp.c 2017-01-28 13:59:10.107662951 +0100
23454 #include <linux/jiffies.h>
23455 #include <linux/kernel.h>
23456 #include <linux/fcntl.h>
23457 +#include <linux/sysrq.h>
23458 #include <linux/socket.h>
23459 #include <linux/in.h>
23460 #include <linux/inet.h>
23462 #include <linux/string.h>
23463 #include <linux/netfilter_ipv4.h>
23464 #include <linux/slab.h>
23465 +#include <linux/locallock.h>
23466 #include <net/snmp.h>
23467 #include <net/ip.h>
23468 #include <net/route.h>
23469 @@ -204,6 +206,8 @@
23471 * On SMP we have one ICMP socket per-cpu.
23473 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
23475 static struct sock *icmp_sk(struct net *net)
23477 return *this_cpu_ptr(net->ipv4.icmp_sk);
23478 @@ -215,12 +219,14 @@
23480 local_bh_disable();
23482 + local_lock(icmp_sk_lock);
23485 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
23486 /* This can happen if the output path signals a
23487 * dst_link_failure() for an outgoing ICMP packet.
23489 + local_unlock(icmp_sk_lock);
23493 @@ -230,6 +236,7 @@
23494 static inline void icmp_xmit_unlock(struct sock *sk)
23496 spin_unlock_bh(&sk->sk_lock.slock);
23497 + local_unlock(icmp_sk_lock);
23500 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
23501 @@ -358,6 +365,7 @@
23503 struct sk_buff *skb;
23505 + local_lock(icmp_sk_lock);
23506 sk = icmp_sk(dev_net((*rt)->dst.dev));
23507 if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
23508 icmp_param->data_len+icmp_param->head_len,
23509 @@ -380,6 +388,7 @@
23510 skb->ip_summed = CHECKSUM_NONE;
23511 ip_push_pending_frames(sk, fl4);
23513 + local_unlock(icmp_sk_lock);
23517 @@ -891,6 +900,30 @@
23521 + * 32bit and 64bit have different timestamp lengths, so we check for
23522 + * the cookie at offset 20 and verify it is repeated at offset 50
23524 +#define CO_POS0 20
23525 +#define CO_POS1 50
23526 +#define CO_SIZE sizeof(int)
23527 +#define ICMP_SYSRQ_SIZE 57
22530 + * We got an ICMP_SYSRQ_SIZE-sized ping request. Check for the cookie
22531 + * pattern and, if it matches, send the next byte to sysrq as a trigger.
23533 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
23535 + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
23536 + char *p = skb->data;
23538 + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
23539 + !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
23540 + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
23541 + handle_sysrq(p[CO_POS0 + CO_SIZE]);
23545 * Handle ICMP_ECHO ("ping") requests.
23547 * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
23548 @@ -917,6 +950,11 @@
23549 icmp_param.data_len = skb->len;
23550 icmp_param.head_len = sizeof(struct icmphdr);
23551 icmp_reply(&icmp_param, skb);
23553 + if (skb->len == ICMP_SYSRQ_SIZE &&
23554 + net->ipv4.sysctl_icmp_echo_sysrq) {
23555 + icmp_check_sysrq(net, skb);
23558 /* should there be an ICMP stat for ignored echos? */
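icmp_check_sysrq() above accepts a sysrq trigger only from an ICMP_SYSRQ_SIZE (57) byte echo payload carrying the icmp_echo_sysrq cookie, in network byte order, at offsets 20 and 50, with the byte following each copy naming the sysrq command (both bytes must match). A small userspace sketch that builds such a payload and replays the same check, reusing the patch's constants; actually delivering it would additionally need a raw ICMP socket or a suitably patterned ping, which is not shown:

    /* Builds the 57-byte payload that icmp_check_sysrq() above matches
     * and replays the same check on it; the constants mirror the patch.
     * Build: gcc -o sysrqping sysrqping.c */
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>

    #define CO_POS0         20
    #define CO_POS1         50
    #define CO_SIZE         sizeof(int)
    #define ICMP_SYSRQ_SIZE 57

    static int matches(const char *p, int cookie)
    {
        return !memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
               !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
               p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE];
    }

    int main(void)
    {
        char payload[ICMP_SYSRQ_SIZE];
        int cookie = htonl(0x01020304); /* must equal the icmp_echo_sysrq sysctl */

        memset(payload, 0, sizeof(payload));
        memcpy(payload + CO_POS0, &cookie, CO_SIZE);
        memcpy(payload + CO_POS1, &cookie, CO_SIZE);
        payload[CO_POS0 + CO_SIZE] = 't';   /* sysrq-t: dump task states */
        payload[CO_POS1 + CO_SIZE] = 't';   /* must match the first copy */

        printf("payload %s\n", matches(payload, cookie) ? "accepted" : "rejected");
        return 0;
    }

Placing the cookie twice, at both 20 and 50, is what makes the check robust against the differing 32-bit and 64-bit timestamp lengths mentioned in the comment above.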
23560 diff -Nur linux-4.9.6.orig/net/ipv4/sysctl_net_ipv4.c linux-4.9.6/net/ipv4/sysctl_net_ipv4.c
23561 --- linux-4.9.6.orig/net/ipv4/sysctl_net_ipv4.c 2017-01-26 08:25:24.000000000 +0100
23562 +++ linux-4.9.6/net/ipv4/sysctl_net_ipv4.c 2017-01-28 13:59:10.107662951 +0100
23563 @@ -681,6 +681,13 @@
23564 .proc_handler = proc_dointvec
23567 + .procname = "icmp_echo_sysrq",
23568 + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq,
23569 + .maxlen = sizeof(int),
23571 + .proc_handler = proc_dointvec
23574 .procname = "icmp_ignore_bogus_error_responses",
23575 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
23576 .maxlen = sizeof(int),
23577 diff -Nur linux-4.9.6.orig/net/ipv4/tcp_ipv4.c linux-4.9.6/net/ipv4/tcp_ipv4.c
23578 --- linux-4.9.6.orig/net/ipv4/tcp_ipv4.c 2017-01-26 08:25:24.000000000 +0100
23579 +++ linux-4.9.6/net/ipv4/tcp_ipv4.c 2017-01-28 13:59:10.107662951 +0100
23581 #include <linux/init.h>
23582 #include <linux/times.h>
23583 #include <linux/slab.h>
23584 +#include <linux/locallock.h>
23586 #include <net/net_namespace.h>
23587 #include <net/icmp.h>
23588 @@ -564,6 +565,7 @@
23590 EXPORT_SYMBOL(tcp_v4_send_check);
23592 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
23594 * This routine will send an RST to the other tcp.
23596 @@ -691,6 +693,8 @@
23597 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
23599 arg.tos = ip_hdr(skb)->tos;
23601 + local_lock(tcp_sk_lock);
23602 local_bh_disable();
23603 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
23604 skb, &TCP_SKB_CB(skb)->header.h4.opt,
23605 @@ -700,6 +704,7 @@
23606 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
23607 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
23609 + local_unlock(tcp_sk_lock);
23611 #ifdef CONFIG_TCP_MD5SIG
23613 @@ -775,6 +780,7 @@
23615 arg.bound_dev_if = oif;
23617 + local_lock(tcp_sk_lock);
23618 local_bh_disable();
23619 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
23620 skb, &TCP_SKB_CB(skb)->header.h4.opt,
23621 @@ -783,6 +789,7 @@
23623 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
23625 + local_unlock(tcp_sk_lock);
23628 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
23629 diff -Nur linux-4.9.6.orig/net/mac80211/rx.c linux-4.9.6/net/mac80211/rx.c
23630 --- linux-4.9.6.orig/net/mac80211/rx.c 2017-01-26 08:25:24.000000000 +0100
23631 +++ linux-4.9.6/net/mac80211/rx.c 2017-01-28 13:59:10.107662951 +0100
23632 @@ -4166,7 +4166,7 @@
23633 struct ieee80211_supported_band *sband;
23634 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
23636 - WARN_ON_ONCE(softirq_count() == 0);
23637 + WARN_ON_ONCE_NONRT(softirq_count() == 0);
23639 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
23641 diff -Nur linux-4.9.6.orig/net/netfilter/core.c linux-4.9.6/net/netfilter/core.c
23642 --- linux-4.9.6.orig/net/netfilter/core.c 2017-01-26 08:25:24.000000000 +0100
23643 +++ linux-4.9.6/net/netfilter/core.c 2017-01-28 13:59:10.107662951 +0100
23644 @@ -22,12 +22,18 @@
23645 #include <linux/proc_fs.h>
23646 #include <linux/mutex.h>
23647 #include <linux/slab.h>
23648 +#include <linux/locallock.h>
23649 #include <linux/rcupdate.h>
23650 #include <net/net_namespace.h>
23651 #include <net/sock.h>
23653 #include "nf_internals.h"
23655 +#ifdef CONFIG_PREEMPT_RT_BASE
23656 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
23657 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
23660 static DEFINE_MUTEX(afinfo_mutex);
23662 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
23663 diff -Nur linux-4.9.6.orig/net/packet/af_packet.c linux-4.9.6/net/packet/af_packet.c
23664 --- linux-4.9.6.orig/net/packet/af_packet.c 2017-01-26 08:25:24.000000000 +0100
23665 +++ linux-4.9.6/net/packet/af_packet.c 2017-01-28 13:59:10.111663104 +0100
23667 #include <linux/if_packet.h>
23668 #include <linux/wireless.h>
23669 #include <linux/kernel.h>
23670 +#include <linux/delay.h>
23671 #include <linux/kmod.h>
23672 #include <linux/slab.h>
23673 #include <linux/vmalloc.h>
23674 @@ -694,7 +695,7 @@
23675 if (BLOCK_NUM_PKTS(pbd)) {
23676 while (atomic_read(&pkc->blk_fill_in_prog)) {
23677 /* Waiting for skb_copy_bits to finish... */
23683 @@ -956,7 +957,7 @@
23684 if (!(status & TP_STATUS_BLK_TMO)) {
23685 while (atomic_read(&pkc->blk_fill_in_prog)) {
23686 /* Waiting for skb_copy_bits to finish... */
23691 prb_close_block(pkc, pbd, po, status);
23692 diff -Nur linux-4.9.6.orig/net/rds/ib_rdma.c linux-4.9.6/net/rds/ib_rdma.c
23693 --- linux-4.9.6.orig/net/rds/ib_rdma.c 2017-01-26 08:25:24.000000000 +0100
23694 +++ linux-4.9.6/net/rds/ib_rdma.c 2017-01-28 13:59:10.111663104 +0100
23696 #include <linux/slab.h>
23697 #include <linux/rculist.h>
23698 #include <linux/llist.h>
23699 +#include <linux/delay.h>
23701 #include "rds_single_path.h"
23703 @@ -210,7 +211,7 @@
23704 for_each_online_cpu(cpu) {
23705 flag = &per_cpu(clean_list_grace, cpu);
23706 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
23712 diff -Nur linux-4.9.6.orig/net/rxrpc/security.c linux-4.9.6/net/rxrpc/security.c
23713 --- linux-4.9.6.orig/net/rxrpc/security.c 2017-01-26 08:25:24.000000000 +0100
23714 +++ linux-4.9.6/net/rxrpc/security.c 2017-01-28 13:59:10.111663104 +0100
23716 #include <keys/rxrpc-type.h>
23717 #include "ar-internal.h"
23719 -static LIST_HEAD(rxrpc_security_methods);
23720 -static DECLARE_RWSEM(rxrpc_security_sem);
23722 static const struct rxrpc_security *rxrpc_security_types[] = {
23723 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
23724 #ifdef CONFIG_RXKAD
23725 diff -Nur linux-4.9.6.orig/net/sched/sch_api.c linux-4.9.6/net/sched/sch_api.c
23726 --- linux-4.9.6.orig/net/sched/sch_api.c 2017-01-26 08:25:24.000000000 +0100
23727 +++ linux-4.9.6/net/sched/sch_api.c 2017-01-28 13:59:10.111663104 +0100
23728 @@ -981,7 +981,7 @@
23729 rcu_assign_pointer(sch->stab, stab);
23731 if (tca[TCA_RATE]) {
23732 - seqcount_t *running;
23733 + net_seqlock_t *running;
23736 if (sch->flags & TCQ_F_MQROOT)
23737 diff -Nur linux-4.9.6.orig/net/sched/sch_generic.c linux-4.9.6/net/sched/sch_generic.c
23738 --- linux-4.9.6.orig/net/sched/sch_generic.c 2017-01-26 08:25:24.000000000 +0100
23739 +++ linux-4.9.6/net/sched/sch_generic.c 2017-01-28 13:59:10.111663104 +0100
23740 @@ -425,7 +425,11 @@
23741 .ops = &noop_qdisc_ops,
23742 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
23743 .dev_queue = &noop_netdev_queue,
23744 +#ifdef CONFIG_PREEMPT_RT_BASE
23745 + .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
23747 .running = SEQCNT_ZERO(noop_qdisc.running),
23749 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
23751 EXPORT_SYMBOL(noop_qdisc);
23752 @@ -624,9 +628,17 @@
23753 lockdep_set_class(&sch->busylock,
23754 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
23756 +#ifdef CONFIG_PREEMPT_RT_BASE
23757 + seqlock_init(&sch->running);
23758 + lockdep_set_class(&sch->running.seqcount,
23759 + dev->qdisc_running_key ?: &qdisc_running_key);
23760 + lockdep_set_class(&sch->running.lock,
23761 + dev->qdisc_running_key ?: &qdisc_running_key);
23763 seqcount_init(&sch->running);
23764 lockdep_set_class(&sch->running,
23765 dev->qdisc_running_key ?: &qdisc_running_key);
23769 sch->enqueue = ops->enqueue;
23770 @@ -925,7 +937,7 @@
23771 /* Wait for outstanding qdisc_run calls. */
23772 list_for_each_entry(dev, head, close_list)
23773 while (some_qdisc_is_busy(dev))
23778 void dev_deactivate(struct net_device *dev)
23779 diff -Nur linux-4.9.6.orig/net/sunrpc/svc_xprt.c linux-4.9.6/net/sunrpc/svc_xprt.c
23780 --- linux-4.9.6.orig/net/sunrpc/svc_xprt.c 2017-01-26 08:25:24.000000000 +0100
23781 +++ linux-4.9.6/net/sunrpc/svc_xprt.c 2017-01-28 13:59:10.111663104 +0100
23782 @@ -396,7 +396,7 @@
23787 + cpu = get_cpu_light();
23788 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
23790 atomic_long_inc(&pool->sp_stats.packets);
23791 @@ -432,7 +432,7 @@
23793 atomic_long_inc(&pool->sp_stats.threads_woken);
23794 wake_up_process(rqstp->rq_task);
23800 @@ -453,7 +453,7 @@
23807 trace_svc_xprt_do_enqueue(xprt, rqstp);
23809 diff -Nur linux-4.9.6.orig/scripts/mkcompile_h linux-4.9.6/scripts/mkcompile_h
23810 --- linux-4.9.6.orig/scripts/mkcompile_h 2017-01-26 08:25:24.000000000 +0100
23811 +++ linux-4.9.6/scripts/mkcompile_h 2017-01-28 13:59:10.111663104 +0100
23820 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
23824 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
23825 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
23826 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
23827 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
23829 # Truncate to maximum length
23830 diff -Nur linux-4.9.6.orig/sound/core/pcm_native.c linux-4.9.6/sound/core/pcm_native.c
23831 --- linux-4.9.6.orig/sound/core/pcm_native.c 2017-01-26 08:25:24.000000000 +0100
23832 +++ linux-4.9.6/sound/core/pcm_native.c 2017-01-28 13:59:10.111663104 +0100
23833 @@ -135,7 +135,7 @@
23834 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
23836 if (!substream->pcm->nonatomic)
23837 - local_irq_disable();
23838 + local_irq_disable_nort();
23839 snd_pcm_stream_lock(substream);
23841 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
23842 @@ -150,7 +150,7 @@
23844 snd_pcm_stream_unlock(substream);
23845 if (!substream->pcm->nonatomic)
23846 - local_irq_enable();
23847 + local_irq_enable_nort();
23849 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
23851 @@ -158,7 +158,7 @@
23853 unsigned long flags = 0;
23854 if (!substream->pcm->nonatomic)
23855 - local_irq_save(flags);
23856 + local_irq_save_nort(flags);
23857 snd_pcm_stream_lock(substream);
23860 @@ -176,7 +176,7 @@
23862 snd_pcm_stream_unlock(substream);
23863 if (!substream->pcm->nonatomic)
23864 - local_irq_restore(flags);
23865 + local_irq_restore_nort(flags);
23867 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);