diff -Nur linux-4.9.6.orig/arch/arm/include/asm/irq.h linux-4.9.6/arch/arm/include/asm/irq.h
--- linux-4.9.6.orig/arch/arm/include/asm/irq.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/irq.h 2017-01-28 13:59:09.887654498 +0100
+#include <linux/cpumask.h>
 extern void migrate_irqs(void);
diff -Nur linux-4.9.6.orig/arch/arm/include/asm/switch_to.h linux-4.9.6/arch/arm/include/asm/switch_to.h
--- linux-4.9.6.orig/arch/arm/include/asm/switch_to.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/switch_to.h 2017-01-28 13:59:09.887654498 +0100
 #include <linux/thread_info.h>
+#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
+switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
 * For v7 SMP cores running a preemptible kernel we may be pre-empted
 * during a TLB maintenance operation, so execute an inner-shareable dsb
 #define switch_to(prev,next,last) \
 __complete_pending_tlbi(); \
+ switch_kmaps(prev, next); \
 last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
diff -Nur linux-4.9.6.orig/arch/arm/include/asm/thread_info.h linux-4.9.6/arch/arm/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/arm/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/include/asm/thread_info.h 2017-01-28 13:59:09.887654498 +0100
 unsigned long flags; /* low level flags */
 int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
 mm_segment_t addr_limit; /* address limit */
 struct task_struct *task; /* main task structure */
 #define TIF_SYSCALL_TRACE 4 /* syscall trace active */
 #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */
 #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */
-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */
+#define TIF_SECCOMP 8 /* seccomp syscall filtering active */
+#define TIF_NEED_RESCHED_LAZY 7
 #define TIF_NOHZ 12 /* in adaptive nohz mode */
 #define TIF_USING_IWMMXT 17
 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_UPROBE (1 << TIF_UPROBE)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 * Change these and you break ASM code in entry-common.S
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_NEED_RESCHED_LAZY)
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
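These thread_info.h hunks are the heart of the lazy-preemption scheme: a second resched flag (TIF_NEED_RESCHED_LAZY, which takes bit 7 and pushes TIF_SECCOMP to bit 8) plus a preempt_lazy_count, so timeslice preemption of SCHED_OTHER tasks can be deferred while RT-class tasks still preempt immediately through plain TIF_NEED_RESCHED. A minimal C sketch of the intended wakeup-side decision; resched_curr_sketch() is an illustrative name, not the patch's API:

/* Sketch only: hard flag for RT tasks, lazy flag for everything else. */
static void resched_curr_sketch(struct rq *rq, bool lazy)
{
	struct task_struct *curr = rq->curr;

	if (!lazy || rt_task(curr)) {
		/* TIF_NEED_RESCHED: honoured at the next irq exit */
		set_tsk_need_resched(curr);
		return;
	}
	/* TIF_NEED_RESCHED_LAZY: honoured only once preempt_lazy_count == 0 */
	set_tsk_thread_flag(curr, TIF_NEED_RESCHED_LAZY);
}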
diff -Nur linux-4.9.6.orig/arch/arm/Kconfig linux-4.9.6/arch/arm/Kconfig
--- linux-4.9.6.orig/arch/arm/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/Kconfig 2017-01-28 13:59:09.887654498 +0100
 select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
 select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
 select HAVE_ARCH_HARDENED_USERCOPY
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
 select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
 select HAVE_ARCH_MMAP_RND_BITS if MMU
 select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
 select HAVE_PERF_EVENTS
 select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
 select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
 select HAVE_REGS_AND_STACK_ACCESS_API
 select HAVE_SYSCALL_TRACEPOINTS
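Two Kconfig knobs change here: HAVE_ARCH_JUMP_LABEL gains !PREEMPT_RT_BASE, presumably because runtime patching of the branch sites stops all CPUs and is at odds with RT latency guarantees, and HAVE_PREEMPT_LAZY opts ARM into the lazy-preemption code. For reference, what a jump label buys (a runtime-patched branch instead of a load-and-test), using the real static-key API; do_feature() is a hypothetical callee:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(feature_key);

static void do_feature(void) { /* hypothetical */ }

void hot_path(void)
{
	/* compiles to a patchable NOP/branch at this site */
	if (static_branch_unlikely(&feature_key))
		do_feature();
}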
diff -Nur linux-4.9.6.orig/arch/arm/kernel/asm-offsets.c linux-4.9.6/arch/arm/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/arm/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/asm-offsets.c 2017-01-28 13:59:09.887654498 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/arm/kernel/entry-armv.S linux-4.9.6/arch/arm/kernel/entry-armv.S
--- linux-4.9.6.orig/arch/arm/kernel/entry-armv.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/entry-armv.S 2017-01-28 13:59:09.887654498 +0100
@@ -220,11 +220,18 @@
 #ifdef CONFIG_PREEMPT
 ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
- ldr r0, [tsk, #TI_FLAGS] @ get flags
 teq r8, #0 @ if preempt count != 0
+ bne 1f @ return from exception
+ ldr r0, [tsk, #TI_FLAGS] @ get flags
+ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
+ blne svc_preempt @ preempt!
+ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r8, #0 @ if preempt lazy count != 0
 movne r0, #0 @ force flags to 0
- tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
 svc_exit r5, irq = 1 @ return from exception
 1: bl preempt_schedule_irq @ irq en/disable is done inside
 ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
 tst r0, #_TIF_NEED_RESCHED
+ tst r0, #_TIF_NEED_RESCHED_LAZY
+ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
+ teq r0, #0 @ if preempt lazy count != 0
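Rendered as C, the reworked svc-exit check above does roughly this (a readability sketch of the assembly, not code from the patch):

	if (ti->preempt_count == 0) {
		if (ti->flags & _TIF_NEED_RESCHED)
			svc_preempt();	/* hard resched: always honoured */
		if (ti->preempt_lazy_count == 0 &&
		    (ti->flags & _TIF_NEED_RESCHED_LAZY))
			svc_preempt();	/* lazy resched: only when the lazy
					   count is zero as well */
	}
	/* fall through to svc_exit: return from the exception */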
diff -Nur linux-4.9.6.orig/arch/arm/kernel/entry-common.S linux-4.9.6/arch/arm/kernel/entry-common.S
--- linux-4.9.6.orig/arch/arm/kernel/entry-common.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/entry-common.S 2017-01-28 13:59:09.887654498 +0100
 disable_irq_notrace @ disable interrupts
 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
+ bne fast_work_pending
+ tst r1, #_TIF_SECCOMP
 bne fast_work_pending
 /* perform architecture specific actions before user return */
 str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
 disable_irq_notrace @ disable interrupts
 ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
+ tst r1, #_TIF_SECCOMP
 ENDPROC(ret_fast_syscall)
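Splitting the single tst into two is presumably an encoding fix, not a behavioural change: with TIF_SECCOMP moved to bit 8 the combined work mask spans nine bits, and an ARM data-processing immediate is an 8-bit value rotated right by an even count, so the old one-instruction test no longer assembles. A standalone user-space check of that rule:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True if v fits ARM's "8-bit value rotated right by 2n" immediate form. */
static bool arm_imm_encodable(uint32_t v)
{
	for (int rot = 0; rot < 32; rot += 2) {
		/* rotate v left by rot; if the result fits 8 bits, it encodes */
		uint32_t r = (v << rot) | (v >> ((32 - rot) & 31));
		if (r <= 0xff)
			return true;
	}
	return false;
}

int main(void)
{
	printf("0x0ff encodable: %d\n", arm_imm_encodable(0x0ff)); /* 1 */
	printf("0x1ff encodable: %d\n", arm_imm_encodable(0x1ff)); /* 0: 9 bits */
	return 0;
}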
diff -Nur linux-4.9.6.orig/arch/arm/kernel/patch.c linux-4.9.6/arch/arm/kernel/patch.c
--- linux-4.9.6.orig/arch/arm/kernel/patch.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/patch.c 2017-01-28 13:59:09.895654805 +0100
-static DEFINE_SPINLOCK(patch_lock);
+static DEFINE_RAW_SPINLOCK(patch_lock);
 static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
 __acquires(&patch_lock)
- spin_lock_irqsave(&patch_lock, *flags);
+ raw_spin_lock_irqsave(&patch_lock, *flags);
 __acquire(&patch_lock);
 clear_fixmap(fixmap);
- spin_unlock_irqrestore(&patch_lock, *flags);
+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
 __release(&patch_lock);
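The patch.c conversion is the first of many in this patch: on PREEMPT_RT_FULL a plain spinlock_t becomes a sleeping, rtmutex-based lock, so any lock taken in truly atomic context (here: kernel text patching, which can run with interrupts hard-disabled) must become a raw_spinlock_t, which always spins. The minimal pattern:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static void atomic_path(void)
{
	unsigned long flags;

	/* never sleeps, even on RT - keep the critical section short */
	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... bounded work ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}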
diff -Nur linux-4.9.6.orig/arch/arm/kernel/process.c linux-4.9.6/arch/arm/kernel/process.c
--- linux-4.9.6.orig/arch/arm/kernel/process.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/process.c 2017-01-28 13:59:09.895654805 +0100
+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
+static int __init vectors_user_mapping_init_page(void)
+ unsigned long addr = 0xffff0000;
+ pgd = pgd_offset_k(addr);
+ pud = pud_offset(pgd, addr);
+ pmd = pmd_offset(pud, addr);
+ page = pmd_page(*(pmd));
+ pgtable_page_ctor(page);
+late_initcall(vectors_user_mapping_init_page);
 #ifdef CONFIG_KUSER_HELPERS
 * The vectors page is always readable from user space for the
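The process.c hunk above is truncated in this listing (the local declarations, the end of the comment and the return are elided). Filled out with the obvious page-table walk, the helper presumably reads as follows; a reconstruction, not a quote from the patch:

static int __init vectors_user_mapping_init_page(void)
{
	struct page *page;
	unsigned long addr = 0xffff0000;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset_k(addr);
	pud = pud_offset(pgd, addr);
	pmd = pmd_offset(pud, addr);
	page = pmd_page(*(pmd));

	pgtable_page_ctor(page);	/* init page->ptl so a coredump of the
					   vector page does not trip over it */
	return 0;
}
late_initcall(vectors_user_mapping_init_page);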
diff -Nur linux-4.9.6.orig/arch/arm/kernel/signal.c linux-4.9.6/arch/arm/kernel/signal.c
--- linux-4.9.6.orig/arch/arm/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/signal.c 2017-01-28 13:59:09.895654805 +0100
 trace_hardirqs_off();
- if (likely(thread_flags & _TIF_NEED_RESCHED)) {
+ if (likely(thread_flags & (_TIF_NEED_RESCHED |
+ _TIF_NEED_RESCHED_LAZY))) {
 if (unlikely(!user_mode(regs)))
diff -Nur linux-4.9.6.orig/arch/arm/kernel/smp.c linux-4.9.6/arch/arm/kernel/smp.c
--- linux-4.9.6.orig/arch/arm/kernel/smp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/smp.c 2017-01-28 13:59:09.895654805 +0100
 local_flush_tlb_all();
- clear_tasks_mm_cpumask(cpu);
 pr_err("CPU%u: cpu didn't die\n", cpu);
+ clear_tasks_mm_cpumask(cpu);
 pr_notice("CPU%u: shutdown\n", cpu);
diff -Nur linux-4.9.6.orig/arch/arm/kernel/unwind.c linux-4.9.6/arch/arm/kernel/unwind.c
--- linux-4.9.6.orig/arch/arm/kernel/unwind.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kernel/unwind.c 2017-01-28 13:59:09.895654805 +0100
 static const struct unwind_idx *__origin_unwind_idx;
 extern const struct unwind_idx __stop_unwind_idx[];
-static DEFINE_SPINLOCK(unwind_lock);
+static DEFINE_RAW_SPINLOCK(unwind_lock);
 static LIST_HEAD(unwind_tables);
 /* Convert a prel31 symbol to an absolute address */
 /* module unwind tables */
 struct unwind_table *table;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_for_each_entry(table, &unwind_tables, list) {
 if (addr >= table->begin_addr &&
 addr < table->end_addr) {
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
 pr_debug("%s: idx = %p\n", __func__, idx);
 tab->begin_addr = text_addr;
 tab->end_addr = text_addr + text_size;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_add_tail(&tab->list, &unwind_tables);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
 list_del(&tab->list);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
diff -Nur linux-4.9.6.orig/arch/arm/kvm/arm.c linux-4.9.6/arch/arm/kvm/arm.c
--- linux-4.9.6.orig/arch/arm/kvm/arm.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/kvm/arm.c 2017-01-28 13:59:09.895654805 +0100
 * involves poking the GIC, which must be done in a
 * non-preemptible context.
 kvm_pmu_flush_hwstate(vcpu);
 kvm_timer_flush_hwstate(vcpu);
 kvm_vgic_flush_hwstate(vcpu);
 kvm_pmu_sync_hwstate(vcpu);
 kvm_timer_sync_hwstate(vcpu);
 kvm_vgic_sync_hwstate(vcpu);
 kvm_vgic_sync_hwstate(vcpu);
 ret = handle_exit(vcpu, run, ret);
diff -Nur linux-4.9.6.orig/arch/arm/mach-exynos/platsmp.c linux-4.9.6/arch/arm/mach-exynos/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-exynos/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-exynos/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 return (void __iomem *)(S5P_VA_SCU);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void exynos_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 printk(KERN_ERR "cpu1 power enable failed");
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? ret : 0;
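exynos is the first of eight platforms below (hisi, omap2, prima2, qcom, spear, sti, plat-versatile) whose boot_lock gets the same raw conversion: the secondary CPU takes this lock while it is barely alive, before it can schedule and with interrupts off, so a sleeping RT spinlock would be fatal there. The shared shape of the code, sketched; write_pen_release() stands in for each platform's own release mechanism:

static DEFINE_RAW_SPINLOCK(boot_lock);

static void secondary_init_sketch(unsigned int cpu)
{
	/* let the boot CPU know this core is out of the holding pen */
	write_pen_release(-1);

	/* synchronise with the boot thread; must not sleep here */
	raw_spin_lock(&boot_lock);
	raw_spin_unlock(&boot_lock);
}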
diff -Nur linux-4.9.6.orig/arch/arm/mach-hisi/platmcpm.c linux-4.9.6/arch/arm/mach-hisi/platmcpm.c
--- linux-4.9.6.orig/arch/arm/mach-hisi/platmcpm.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-hisi/platmcpm.c 2017-01-28 13:59:09.895654805 +0100
 static void __iomem *sysctrl, *fabric;
 static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static u32 fabric_phys_addr;
 * [0]: bootwrapper physical address
 if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
 if (hip04_cpu_table[cluster][cpu])
 hip04_cpu_table[cluster][cpu]++;
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
@@ -162,11 +162,11 @@
 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 hip04_cpu_table[cluster][cpu]--;
 if (hip04_cpu_table[cluster][cpu] == 1) {
 /* A power_up request went ahead of us. */
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 } else if (hip04_cpu_table[cluster][cpu] > 1) {
 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
 last_man = hip04_cluster_is_down(cluster);
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 /* Since it's Cortex A15, disable L2 prefetching. */
 cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
 count = TIMEOUT_MSEC / POLL_MSEC;
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
 for (tries = 0; tries < count; tries++) {
 if (hip04_cpu_table[cluster][cpu])
@@ -211,10 +211,10 @@
 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
 if (data & CORE_WFI_STATUS(cpu))
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
 /* Wait for clean L2 when the whole cluster is down. */
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
@@ -231,10 +231,10 @@
 if (hip04_cluster_is_down(cluster))
 hip04_set_snoop_filter(cluster, 0);
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-omap2/omap-smp.c linux-4.9.6/arch/arm/mach-omap2/omap-smp.c
--- linux-4.9.6.orig/arch/arm/mach-omap2/omap-smp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-omap2/omap-smp.c 2017-01-28 13:59:09.895654805 +0100
 .startup_addr = omap5_secondary_startup,
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 void __iomem *omap4_get_scu_base(void)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * Update the AuxCoreBoot0 with boot state for secondary core.
 * Now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-prima2/platsmp.c linux-4.9.6/arch/arm/mach-prima2/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-prima2/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-prima2/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 static void __iomem *clk_base;
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void sirfsoc_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static const struct of_device_id clk_ids[] = {
 /* make sure write buffer is drained */
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mach-qcom/platsmp.c linux-4.9.6/arch/arm/mach-qcom/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-qcom/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-qcom/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 extern void secondary_startup_arm(void);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 #ifdef CONFIG_HOTPLUG_CPU
 static void qcom_cpu_die(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int scss_release_secondary(unsigned int cpu)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * Send the secondary CPU a soft interrupt, thereby causing
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
diff -Nur linux-4.9.6.orig/arch/arm/mach-spear/platsmp.c linux-4.9.6/arch/arm/mach-spear/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-spear/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-spear/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mach-sti/platsmp.c linux-4.9.6/arch/arm/mach-sti/platsmp.c
--- linux-4.9.6.orig/arch/arm/mach-sti/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mach-sti/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 static void sti_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * The secondary processor is waiting to be released from
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm/mm/fault.c linux-4.9.6/arch/arm/mm/fault.c
--- linux-4.9.6.orig/arch/arm/mm/fault.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mm/fault.c 2017-01-28 13:59:09.895654805 +0100
 if (addr < TASK_SIZE)
 return do_page_fault(addr, fsr, regs);
+ if (interrupts_enabled(regs))
+ local_irq_enable();
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ if (interrupts_enabled(regs))
+ local_irq_enable();
 do_bad_area(addr, fsr, regs);
diff -Nur linux-4.9.6.orig/arch/arm/mm/highmem.c linux-4.9.6/arch/arm/mm/highmem.c
--- linux-4.9.6.orig/arch/arm/mm/highmem.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/mm/highmem.c 2017-01-28 13:59:09.895654805 +0100
+static unsigned int fixmap_idx(int type)
+ return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
 void *kmap(struct page *page)
 void *kmap_atomic(struct page *page)
+ pte_t pte = mk_pte(page, kmap_prot);
+ preempt_disable_nort();
 if (!PageHighMem(page))
 return page_address(page);
 type = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 * in place, so the contained TLB flush ensures the TLB is updated
 * with the new mapping.
- set_fixmap_pte(idx, mk_pte(page, kmap_prot));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = pte;
+ set_fixmap_pte(idx, pte);
 return (void *)vaddr;
@@ -106,44 +115,75 @@
 if (kvaddr >= (void *)FIXADDR_START) {
 type = kmap_atomic_idx();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = __pte(0);
 #ifdef CONFIG_DEBUG_HIGHMEM
 BUG_ON(vaddr != __fix_to_virt(idx));
- set_fixmap_pte(idx, __pte(0));
 (void) idx; /* to kill a warning */
+ set_fixmap_pte(idx, __pte(0));
 kmap_atomic_idx_pop();
 } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
 /* this address was obtained through kmap_high_get() */
 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ preempt_enable_nort();
 EXPORT_SYMBOL(__kunmap_atomic);
 void *kmap_atomic_pfn(unsigned long pfn)
+ pte_t pte = pfn_pte(pfn, kmap_prot);
 struct page *page = pfn_to_page(pfn);
+ preempt_disable_nort();
 if (!PageHighMem(page))
 return page_address(page);
 type = kmap_atomic_idx_push();
- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
+ idx = fixmap_idx(type);
 vaddr = __fix_to_virt(idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ current->kmap_pte[type] = pte;
+ set_fixmap_pte(idx, pte);
 return (void *)vaddr;
+#if defined CONFIG_PREEMPT_RT_FULL
+void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
+ * Clear @prev's kmap_atomic mappings
+ for (i = 0; i < prev_p->kmap_idx; i++) {
+ int idx = fixmap_idx(i);
+ set_fixmap_pte(idx, __pte(0));
+ * Restore @next_p's kmap_atomic mappings
+ for (i = 0; i < next_p->kmap_idx; i++) {
+ int idx = fixmap_idx(i);
+ if (!pte_none(next_p->kmap_pte[i]))
+ set_fixmap_pte(idx, next_p->kmap_pte[i]);
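The highmem.c rework changes the contract of kmap_atomic() on RT: preempt_disable_nort() is a no-op there, so a task may now be preempted while holding an atomic kmap. To keep the mapping valid, the pte is cached per task (current->kmap_pte[], a field the wider RT patch adds to task_struct outside this excerpt), and the switch_kmaps() hook wired into switch_to() earlier tears down the previous task's fixmap slots and reinstalls the next task's. Callers are unchanged:

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_into_highpage(struct page *page, const void *buf, size_t len)
{
	void *vaddr = kmap_atomic(page);

	/* on RT this section may be preempted; the fixmap pte follows
	   the task via kmap_pte[]/switch_kmaps() */
	memcpy(vaddr, buf, len);
	kunmap_atomic(vaddr);
}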
diff -Nur linux-4.9.6.orig/arch/arm/plat-versatile/platsmp.c linux-4.9.6/arch/arm/plat-versatile/platsmp.c
--- linux-4.9.6.orig/arch/arm/plat-versatile/platsmp.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm/plat-versatile/platsmp.c 2017-01-28 13:59:09.895654805 +0100
 sync_cache_w(&pen_release);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 void versatile_secondary_init(unsigned int cpu)
 * Synchronise with the boot thread.
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
 * Set synchronisation state between this boot processor
 * and the secondary one
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
 * This is really belt and braces; we hold unintended secondary
 * now the secondary core is starting up let it run its
 * calibrations, then wait for it to finish
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
 return pen_release != -1 ? -ENOSYS : 0;
diff -Nur linux-4.9.6.orig/arch/arm64/include/asm/thread_info.h linux-4.9.6/arch/arm64/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/arm64/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/include/asm/thread_info.h 2017-01-28 13:59:09.895654805 +0100
 mm_segment_t addr_limit; /* address limit */
 struct task_struct *task; /* main task structure */
 int preempt_count; /* 0 => preemptable, <0 => bug */
+ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
 #define TIF_NEED_RESCHED 1
 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
+#define TIF_NEED_RESCHED_LAZY 4
 #define TIF_SYSCALL_TRACE 8
 #define TIF_SYSCALL_AUDIT 9
 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
 #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
 #define _TIF_NOHZ (1 << TIF_NOHZ)
 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_32BIT (1 << TIF_32BIT)
 #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
- _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
+ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
+ _TIF_NEED_RESCHED_LAZY)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
 #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff -Nur linux-4.9.6.orig/arch/arm64/Kconfig linux-4.9.6/arch/arm64/Kconfig
--- linux-4.9.6.orig/arch/arm64/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/Kconfig 2017-01-28 13:59:09.895654805 +0100
 select HAVE_PERF_EVENTS
 select HAVE_PERF_REGS
 select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_PREEMPT_LAZY
 select HAVE_REGS_AND_STACK_ACCESS_API
 select HAVE_RCU_TABLE_FREE
 select HAVE_SYSCALL_TRACEPOINTS
 bool "Xen guest support on ARM64"
- depends on ARM64 && OF
+ depends on ARM64 && OF && !PREEMPT_RT_FULL
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/asm-offsets.c linux-4.9.6/arch/arm64/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/arm64/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/asm-offsets.c 2017-01-28 13:59:09.895654805 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/entry.S linux-4.9.6/arch/arm64/kernel/entry.S
--- linux-4.9.6.orig/arch/arm64/kernel/entry.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/entry.S 2017-01-28 13:59:09.895654805 +0100
@@ -428,11 +428,16 @@
 #ifdef CONFIG_PREEMPT
 ldr w24, [tsk, #TI_PREEMPT] // get preempt count
- cbnz w24, 1f // preempt count != 0
+ cbnz w24, 2f // preempt count != 0
 ldr x0, [tsk, #TI_FLAGS] // get flags
- tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
+ tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
+ ldr w24, [tsk, #TI_PREEMPT_LAZY] // get preempt lazy count
+ cbnz w24, 2f // preempt lazy count != 0
+ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
 #ifdef CONFIG_TRACE_IRQFLAGS
 1: bl preempt_schedule_irq // irq en/disable is done inside
 ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
 tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
+ tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
diff -Nur linux-4.9.6.orig/arch/arm64/kernel/signal.c linux-4.9.6/arch/arm64/kernel/signal.c
--- linux-4.9.6.orig/arch/arm64/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/arm64/kernel/signal.c 2017-01-28 13:59:09.895654805 +0100
 trace_hardirqs_off();
- if (thread_flags & _TIF_NEED_RESCHED) {
+ if (thread_flags & _TIF_NEED_RESCHED_MASK) {
diff -Nur linux-4.9.6.orig/arch/Kconfig linux-4.9.6/arch/Kconfig
--- linux-4.9.6.orig/arch/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/Kconfig 2017-01-28 13:59:09.887654498 +0100
 tristate "OProfile system profiling"
 depends on PROFILING
 depends on HAVE_OPROFILE
+ depends on !PREEMPT_RT_FULL
 select RING_BUFFER_ALLOW_SWAP
 bool "Optimize very unlikely/likely branches"
 depends on HAVE_ARCH_JUMP_LABEL
+ depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
 This option enables a transparent branch optimization that
 makes certain almost-always-true or almost-always-false branch
diff -Nur linux-4.9.6.orig/arch/mips/Kconfig linux-4.9.6/arch/mips/Kconfig
--- linux-4.9.6.orig/arch/mips/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/mips/Kconfig 2017-01-28 13:59:09.895654805 +0100
@@ -2514,7 +2514,7 @@
 bool "High Memory Support"
- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
 config CPU_SUPPORTS_HIGHMEM
diff -Nur linux-4.9.6.orig/arch/powerpc/include/asm/thread_info.h linux-4.9.6/arch/powerpc/include/asm/thread_info.h
--- linux-4.9.6.orig/arch/powerpc/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/include/asm/thread_info.h 2017-01-28 13:59:09.895654805 +0100
 int cpu; /* cpu we're on */
 int preempt_count; /* 0 => preemptable,
+ int preempt_lazy_count; /* 0 => preemptable,
 unsigned long local_flags; /* private flags for thread */
 #ifdef CONFIG_LIVEPATCH
 unsigned long *livepatch_sp;
 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
 #define TIF_SIGPENDING 1 /* signal pending */
 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
-#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
- TIF_NEED_RESCHED */
+#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
 #define TIF_32BIT 4 /* 32 bit binary */
 #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
 #if defined(CONFIG_PPC64)
 #define TIF_ELF2ABI 18 /* function descriptors must die! */
+#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
+ TIF_NEED_RESCHED */
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
@@ -125,14 +128,16 @@
 #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
 #define _TIF_NOHZ (1<<TIF_NOHZ)
+#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
 #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
+#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
 /* Bits in local_flags */
 /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
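The bit shuffle has a likely reason beyond tidiness: the powerpc entry code tests these flags with andi., whose immediate is only 16 bits wide, so TIF_NEED_RESCHED_LAZY must live in bits 0-15. TIF_POLLING_NRFLAG is only touched from C, so it can be evicted to bit 19 to make room. A guard along these lines (illustrative, placed inside any compiled function) would document the constraint:

static void __init check_tif_layout(void)
{
	/* andi. takes a 16-bit immediate: the resched flags must stay low */
	BUILD_BUG_ON(_TIF_NEED_RESCHED_MASK & ~0xffffUL);
}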
diff -Nur linux-4.9.6.orig/arch/powerpc/Kconfig linux-4.9.6/arch/powerpc/Kconfig
--- linux-4.9.6.orig/arch/powerpc/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/Kconfig 2017-01-28 13:59:09.895654805 +0100
 config RWSEM_GENERIC_SPINLOCK
+ default y if PREEMPT_RT_FULL
 config RWSEM_XCHGADD_ALGORITHM
+ default y if !PREEMPT_RT_FULL
 config GENERIC_LOCKBREAK
 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 select GENERIC_STRNCPY_FROM_USER
 select GENERIC_STRNLEN_USER
+ select HAVE_PREEMPT_LAZY
 select HAVE_MOD_ARCH_SPECIFIC
 select MODULES_USE_ELF_RELA
 select CLONE_BACKWARDS
 bool "High memory support"
+ depends on PPC32 && !PREEMPT_RT_FULL
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/asm-offsets.c linux-4.9.6/arch/powerpc/kernel/asm-offsets.c
--- linux-4.9.6.orig/arch/powerpc/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/asm-offsets.c 2017-01-28 13:59:09.895654805 +0100
 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
 DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
+ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
 DEFINE(TI_TASK, offsetof(struct thread_info, task));
 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/entry_32.S linux-4.9.6/arch/powerpc/kernel/entry_32.S
--- linux-4.9.6.orig/arch/powerpc/kernel/entry_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/entry_32.S 2017-01-28 13:59:09.895654805 +0100
@@ -835,7 +835,14 @@
 cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
 andi. r8,r8,_TIF_NEED_RESCHED
+ lwz r0,TI_PREEMPT_LAZY(r9)
+ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
+ lwz r0,TI_FLAGS(r9)
+ andi. r0,r0,_TIF_NEED_RESCHED_LAZY
 andi. r0,r3,MSR_EE /* interrupts off? */
 beq restore /* don't schedule if so */
@@ -846,11 +853,11 @@
 bl trace_hardirqs_off
-1: bl preempt_schedule_irq
+2: bl preempt_schedule_irq
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r3,_TIF_NEED_RESCHED
+ andi. r0,r3,_TIF_NEED_RESCHED_MASK
 #ifdef CONFIG_TRACE_IRQFLAGS
 /* And now, to properly rebalance the above, we tell lockdep they
 * are being turned back on, which will happen when we return
@@ -1171,7 +1178,7 @@
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
 do_work: /* r10 contains MSR_KERNEL here */
- andi. r0,r9,_TIF_NEED_RESCHED
+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
 do_resched: /* r10 contains MSR_KERNEL here */
@@ -1192,7 +1199,7 @@
 MTMSRD(r10) /* disable interrupts */
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r9,_TIF_NEED_RESCHED
+ andi. r0,r9,_TIF_NEED_RESCHED_MASK
 andi. r0,r9,_TIF_USER_WORK_MASK
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/entry_64.S linux-4.9.6/arch/powerpc/kernel/entry_64.S
--- linux-4.9.6.orig/arch/powerpc/kernel/entry_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/entry_64.S 2017-01-28 13:59:09.895654805 +0100
-1: andi. r0,r4,_TIF_NEED_RESCHED
+1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
 bl restore_interrupts
@@ -718,10 +718,18 @@
 #ifdef CONFIG_PREEMPT
 /* Check if we need to preempt */
+ lwz r8,TI_PREEMPT(r9)
+ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
 andi. r0,r4,_TIF_NEED_RESCHED
+ andi. r0,r4,_TIF_NEED_RESCHED_LAZY
+ lwz r8,TI_PREEMPT_LAZY(r9)
 /* Check that preempt_count() == 0 and interrupts are enabled */
- lwz r8,TI_PREEMPT(r9)
 /* Re-test flags and eventually loop */
 CURRENT_THREAD_INFO(r9, r1)
- andi. r0,r4,_TIF_NEED_RESCHED
+ andi. r0,r4,_TIF_NEED_RESCHED_MASK
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/irq.c linux-4.9.6/arch/powerpc/kernel/irq.c
--- linux-4.9.6.orig/arch/powerpc/kernel/irq.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/irq.c 2017-01-28 13:59:09.895654805 +0100
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 struct thread_info *curtp, *irqtp;
 set_bits(irqtp->flags, &curtp->flags);
 irq_hw_number_t virq_to_hw(unsigned int virq)
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/misc_32.S linux-4.9.6/arch/powerpc/kernel/misc_32.S
--- linux-4.9.6.orig/arch/powerpc/kernel/misc_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/misc_32.S 2017-01-28 13:59:09.895654805 +0100
 * We store the saved ksp_limit in the unused part
 * of the STACK_FRAME_OVERHEAD
+#ifndef CONFIG_PREEMPT_RT_FULL
 _GLOBAL(call_do_softirq)
 stw r10,THREAD+KSP_LIMIT(r2)
 * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
diff -Nur linux-4.9.6.orig/arch/powerpc/kernel/misc_64.S linux-4.9.6/arch/powerpc/kernel/misc_64.S
--- linux-4.9.6.orig/arch/powerpc/kernel/misc_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kernel/misc_64.S 2017-01-28 13:59:09.895654805 +0100
+#ifndef CONFIG_PREEMPT_RT_FULL
 _GLOBAL(call_do_softirq)
 _GLOBAL(call_do_irq)
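powerpc, and below it sh, sparc64 and x86_64, all wrap their do_softirq_own_stack()/call_do_softirq implementations in #ifndef CONFIG_PREEMPT_RT_FULL: on RT, softirqs run in thread context (ksoftirqd or the raising task), so the switch-to-IRQ-stack entry points are dead code there. Condensed from the powerpc version, whose surrounding lines are elided above:

#ifndef CONFIG_PREEMPT_RT_FULL
void do_softirq_own_stack(void)
{
	struct thread_info *curtp = current_thread_info();
	struct thread_info *irqtp = softirq_ctx[smp_processor_id()];

	irqtp->task = curtp->task;
	irqtp->flags = 0;
	call_do_softirq(irqtp);	/* run __do_softirq() on the softirq stack */
	irqtp->task = NULL;
	if (irqtp->flags)	/* propagate flags set while borrowed */
		set_bits(irqtp->flags, &curtp->flags);
}
#endif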
diff -Nur linux-4.9.6.orig/arch/powerpc/kvm/Kconfig linux-4.9.6/arch/powerpc/kvm/Kconfig
--- linux-4.9.6.orig/arch/powerpc/kvm/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/kvm/Kconfig 2017-01-28 13:59:09.895654805 +0100
 bool "KVM in-kernel MPIC emulation"
 depends on KVM && E500
+ depends on !PREEMPT_RT_FULL
 select HAVE_KVM_IRQCHIP
 select HAVE_KVM_IRQFD
 select HAVE_KVM_IRQ_ROUTING
diff -Nur linux-4.9.6.orig/arch/powerpc/platforms/ps3/device-init.c linux-4.9.6/arch/powerpc/platforms/ps3/device-init.c
--- linux-4.9.6.orig/arch/powerpc/platforms/ps3/device-init.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/powerpc/platforms/ps3/device-init.c 2017-01-28 13:59:09.895654805 +0100
 pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
- res = wait_event_interruptible(dev->done.wait,
+ res = swait_event_interruptible(dev->done.wait,
 dev->done.done || kthread_should_stop());
 if (kthread_should_stop())
diff -Nur linux-4.9.6.orig/arch/sh/kernel/irq.c linux-4.9.6/arch/sh/kernel/irq.c
--- linux-4.9.6.orig/arch/sh/kernel/irq.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sh/kernel/irq.c 2017-01-28 13:59:09.895654805 +0100
 hardirq_ctx[cpu] = NULL;
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 struct thread_info *curctx;
 "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
 static inline void handle_one_irq(unsigned int irq)
diff -Nur linux-4.9.6.orig/arch/sparc/Kconfig linux-4.9.6/arch/sparc/Kconfig
--- linux-4.9.6.orig/arch/sparc/Kconfig 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sparc/Kconfig 2017-01-28 13:59:09.895654805 +0100
@@ -194,12 +194,10 @@
 source kernel/Kconfig.hz
 config RWSEM_GENERIC_SPINLOCK
- default y if SPARC32
+ def_bool PREEMPT_RT_FULL
 config RWSEM_XCHGADD_ALGORITHM
- default y if SPARC64
+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
 config GENERIC_HWEIGHT
diff -Nur linux-4.9.6.orig/arch/sparc/kernel/irq_64.c linux-4.9.6/arch/sparc/kernel/irq_64.c
--- linux-4.9.6.orig/arch/sparc/kernel/irq_64.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/sparc/kernel/irq_64.c 2017-01-28 13:59:09.895654805 +0100
 set_irq_regs(old_regs);
+#ifndef CONFIG_PREEMPT_RT_FULL
 void do_softirq_own_stack(void)
 void *orig_sp, *sp = softirq_stack[smp_processor_id()];
 __asm__ __volatile__("mov %0, %%sp"
 #ifdef CONFIG_HOTPLUG_CPU
 void fixup_irqs(void)
diff -Nur linux-4.9.6.orig/arch/x86/crypto/aesni-intel_glue.c linux-4.9.6/arch/x86/crypto/aesni-intel_glue.c
--- linux-4.9.6.orig/arch/x86/crypto/aesni-intel_glue.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/aesni-intel_glue.c 2017-01-28 13:59:09.895654805 +0100
@@ -372,14 +372,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK);
+ nbytes & AES_BLOCK_MASK);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -396,14 +396,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -420,14 +420,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -444,14 +444,14 @@
 err = blkcipher_walk_virt(desc, &walk);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
@@ -503,18 +503,20 @@
 err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- kernel_fpu_begin();
 while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+ kernel_fpu_begin();
 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
 nbytes & AES_BLOCK_MASK, walk.iv);
 nbytes &= AES_BLOCK_SIZE - 1;
 err = blkcipher_walk_done(desc, &walk, nbytes);
+ kernel_fpu_begin();
 ctr_crypt_final(ctx, &walk);
 err = blkcipher_walk_done(desc, &walk, 0);
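All five aesni hunks make the same move: kernel_fpu_begin() disables preemption, so holding the FPU across an entire blkcipher walk creates a non-preemptible region proportional to the request size. Pulling begin/end into the loop bounds the preempt-off time to one walk step (the matching kernel_fpu_end() moves are elided by this truncated listing). The resulting loop shape:

	while ((nbytes = walk.nbytes)) {
		kernel_fpu_begin();		/* preemption off ...         */
		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes & AES_BLOCK_MASK);
		kernel_fpu_end();		/* ... and on again, per step */
		nbytes &= AES_BLOCK_SIZE - 1;
		err = blkcipher_walk_done(desc, &walk, nbytes);
	}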
diff -Nur linux-4.9.6.orig/arch/x86/crypto/cast5_avx_glue.c linux-4.9.6/arch/x86/crypto/cast5_avx_glue.c
--- linux-4.9.6.orig/arch/x86/crypto/cast5_avx_glue.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/cast5_avx_glue.c 2017-01-28 13:59:09.895654805 +0100
 static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
- bool fpu_enabled = false;
 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 const unsigned int bsize = CAST5_BLOCK_SIZE;
 unsigned int nbytes;
 u8 *wsrc = walk->src.virt.addr;
 u8 *wdst = walk->dst.virt.addr;
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 /* Process multi-block batch */
 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
@@ -103,10 +103,9 @@
 } while (nbytes >= bsize);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 struct scatterlist *src, unsigned int nbytes)
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -236,12 +235,11 @@
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 nbytes = __cbc_decrypt(desc, &walk);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 struct scatterlist *src, unsigned int nbytes)
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -320,13 +318,12 @@
 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
+ fpu_enabled = cast5_fpu_begin(false, nbytes);
 nbytes = __ctr_crypt(desc, &walk);
+ cast5_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- cast5_fpu_end(fpu_enabled);
 ctr_crypt_final(desc, &walk);
 err = blkcipher_walk_done(desc, &walk, 0);
diff -Nur linux-4.9.6.orig/arch/x86/crypto/glue_helper.c linux-4.9.6/arch/x86/crypto/glue_helper.c
--- linux-4.9.6.orig/arch/x86/crypto/glue_helper.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/crypto/glue_helper.c 2017-01-28 13:59:09.895654805 +0100
 void *ctx = crypto_blkcipher_ctx(desc->tfm);
 const unsigned int bsize = 128 / 8;
 unsigned int nbytes, i, func_bytes;
- bool fpu_enabled = false;
 err = blkcipher_walk_virt(desc, walk);
 u8 *wdst = walk->dst.virt.addr;
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 for (i = 0; i < gctx->num_funcs; i++) {
 func_bytes = bsize * gctx->funcs[i].num_blocks;
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, walk, nbytes);
- glue_fpu_end(fpu_enabled);
 struct scatterlist *src, unsigned int nbytes)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -203,12 +203,12 @@
 while ((nbytes = walk.nbytes)) {
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- glue_fpu_end(fpu_enabled);
 EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
 struct scatterlist *src, unsigned int nbytes)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -286,13 +286,12 @@
 while ((nbytes = walk.nbytes) >= bsize) {
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled, nbytes);
+ desc, false, nbytes);
 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
- glue_fpu_end(fpu_enabled);
 glue_ctr_crypt_final_128bit(
 gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
 void *tweak_ctx, void *crypt_ctx)
 const unsigned int bsize = 128 / 8;
- bool fpu_enabled = false;
 struct blkcipher_walk walk;
@@ -360,21 +359,21 @@
 /* set minimum length to bsize, for tweak_fn */
 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- desc, fpu_enabled,
 nbytes < bsize ? bsize : nbytes);
 /* calculate first value of T */
 tweak_fn(tweak_ctx, walk.iv, walk.iv);
+ glue_fpu_end(fpu_enabled);
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, false, nbytes);
 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+ glue_fpu_end(fpu_enabled);
 err = blkcipher_walk_done(desc, &walk, nbytes);
 nbytes = walk.nbytes;
- glue_fpu_end(fpu_enabled);
 EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
diff -Nur linux-4.9.6.orig/arch/x86/entry/common.c linux-4.9.6/arch/x86/entry/common.c
--- linux-4.9.6.orig/arch/x86/entry/common.c 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/common.c 2017-01-28 13:59:09.899654958 +0100
 #define EXIT_TO_USERMODE_LOOP_FLAGS \
 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
+ _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
 static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
@@ -145,9 +145,16 @@
 /* We have work to do. */
- if (cached_flags & _TIF_NEED_RESCHED)
+ if (cached_flags & _TIF_NEED_RESCHED_MASK)
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (unlikely(current->forced_info.si_signo)) {
+ struct task_struct *t = current;
+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
+ t->forced_info.si_signo = 0;
 if (cached_flags & _TIF_UPROBE)
 uprobe_notify_resume(regs);
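The forced_info block above is the delivery half of ARCH_RT_DELAYS_SIGNAL_SEND (defined for x86 in the signal.h hunk further down); task_struct::forced_info itself is added elsewhere in the RT patch, outside this excerpt. The sending half, sketched: when force_sig_info() runs in atomic context on RT it cannot take the sighand lock (a sleeping lock there), so it parks the siginfo and lets the exit path above deliver it:

	/* sketch of the sending side, not the patch's exact code */
	if (in_atomic()) {
		current->forced_info = *info;	/* picked up later in
						   exit_to_usermode_loop() */
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return 0;
	}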
diff -Nur linux-4.9.6.orig/arch/x86/entry/entry_32.S linux-4.9.6/arch/x86/entry/entry_32.S
--- linux-4.9.6.orig/arch/x86/entry/entry_32.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/entry_32.S 2017-01-28 13:59:09.899654958 +0100
@@ -308,8 +308,25 @@
 ENTRY(resume_kernel)
 DISABLE_INTERRUPTS(CLBR_ANY)
+ # preempt count == 0 + NEED_RS set?
 cmpl $0, PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+ # at least preempt count == 0 ?
+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+ movl PER_CPU_VAR(current_task), %ebp
+ cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
+ testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
 testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
 call preempt_schedule_irq
diff -Nur linux-4.9.6.orig/arch/x86/entry/entry_64.S linux-4.9.6/arch/x86/entry/entry_64.S
--- linux-4.9.6.orig/arch/x86/entry/entry_64.S 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/entry/entry_64.S 2017-01-28 13:59:09.899654958 +0100
@@ -546,7 +546,23 @@
 bt $9, EFLAGS(%rsp) /* were interrupts off? */
 0: cmpl $0, PER_CPU_VAR(__preempt_count)
+#ifndef CONFIG_PREEMPT_LAZY
+ jz do_preempt_schedule_irq
+ # at least preempt count == 0 ?
+ cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
+ movq PER_CPU_VAR(current_task), %rcx
+ cmpl $0, TASK_TI_preempt_lazy_count(%rcx)
+ bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
+do_preempt_schedule_irq:
 call preempt_schedule_irq
+#ifndef CONFIG_PREEMPT_RT_FULL
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
 decl PER_CPU_VAR(irq_count)
 END(do_softirq_own_stack)
 idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
diff -Nur linux-4.9.6.orig/arch/x86/include/asm/preempt.h linux-4.9.6/arch/x86/include/asm/preempt.h
--- linux-4.9.6.orig/arch/x86/include/asm/preempt.h 2017-01-26 08:25:24.000000000 +0100
+++ linux-4.9.6/arch/x86/include/asm/preempt.h 2017-01-28 13:59:09.899654958 +0100
 * a decrement which hits zero means we have no preempt_count and should
-static __always_inline bool __preempt_count_dec_and_test(void)
+static __always_inline bool ____preempt_count_dec_and_test(void)
 GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+static __always_inline bool __preempt_count_dec_and_test(void)
+ if (____preempt_count_dec_and_test())
+#ifdef CONFIG_PREEMPT_LAZY
+ if (current_thread_info()->preempt_lazy_count)
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
 * Returns true when we need to resched and can (barring IRQ state).
 static __always_inline bool should_resched(int preempt_offset)
+#ifdef CONFIG_PREEMPT_LAZY
+ tmp = raw_cpu_read_4(__preempt_count);
+ if (tmp == preempt_offset)
+ /* preempt count == 0 ? */
+ tmp &= ~PREEMPT_NEED_RESCHED;
+ if (current_thread_info()->preempt_lazy_count)
+ return test_thread_flag(TIF_NEED_RESCHED_LAZY);
 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
 #ifdef CONFIG_PREEMPT
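The two preempt.h hunks above are truncated (returns and #else arms elided). Reassembled, they presumably read as below: a decrement that hits zero still reschedules immediately, and otherwise the lazy flag only counts when no lazy sections are open. A reconstruction, not a quote:

static __always_inline bool __preempt_count_dec_and_test(void)
{
	if (____preempt_count_dec_and_test())
		return true;
#ifdef CONFIG_PREEMPT_LAZY
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return false;
#endif
}

static __always_inline bool should_resched(int preempt_offset)
{
#ifdef CONFIG_PREEMPT_LAZY
	u32 tmp = raw_cpu_read_4(__preempt_count);

	if (tmp == preempt_offset)
		return true;

	/* preempt count == 0 ? */
	tmp &= ~PREEMPT_NEED_RESCHED;
	if (tmp)
		return false;
	if (current_thread_info()->preempt_lazy_count)
		return false;
	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
#endif
}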
1810 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/signal.h linux-4.9.6/arch/x86/include/asm/signal.h
1811 --- linux-4.9.6.orig/arch/x86/include/asm/signal.h 2017-01-26 08:25:24.000000000 +0100
1812 +++ linux-4.9.6/arch/x86/include/asm/signal.h 2017-01-28 13:59:09.899654958 +0100
1814 #define SA_IA32_ABI 0x02000000u
1815 #define SA_X32_ABI 0x01000000u
1818 + * Because some traps use the IST stack, we must keep preemption
1819 + * disabled while calling do_trap(), but do_trap() may call
1820 + * force_sig_info() which will grab the signal spin_locks for the
1821 + * task, which in PREEMPT_RT_FULL are mutexes. By defining
1822 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
1823 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
1826 +#if defined(CONFIG_PREEMPT_RT_FULL)
1827 +#define ARCH_RT_DELAYS_SIGNAL_SEND
1830 #ifndef CONFIG_COMPAT
1831 typedef sigset_t compat_sigset_t;
1833 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/stackprotector.h linux-4.9.6/arch/x86/include/asm/stackprotector.h
1834 --- linux-4.9.6.orig/arch/x86/include/asm/stackprotector.h 2017-01-26 08:25:24.000000000 +0100
1835 +++ linux-4.9.6/arch/x86/include/asm/stackprotector.h 2017-01-28 13:59:09.899654958 +0100
1838 static __always_inline void boot_init_stack_canary(void)
1841 + u64 uninitialized_var(canary);
1844 #ifdef CONFIG_X86_64
1846 * of randomness. The TSC only matters for very early init,
1847 * there it already has some randomness on most systems. Later
1848 * on during the bootup the random pool has true entropy too.
1850 + * For preempt-rt we need to weaken the randomness a bit, as
1851 + * we can't call into the random generator from atomic context
1852 + * due to locking constraints. We just leave canary
1853 + * uninitialized and use the TSC based randomness on top of it.
1855 +#ifndef CONFIG_PREEMPT_RT_FULL
1856 get_random_bytes(&canary, sizeof(canary));
1859 canary += tsc + (tsc << 32UL);
1861 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/thread_info.h linux-4.9.6/arch/x86/include/asm/thread_info.h
1862 --- linux-4.9.6.orig/arch/x86/include/asm/thread_info.h 2017-01-26 08:25:24.000000000 +0100
1863 +++ linux-4.9.6/arch/x86/include/asm/thread_info.h 2017-01-28 13:59:09.899654958 +0100
1866 struct thread_info {
1867 unsigned long flags; /* low level flags */
1868 + int preempt_lazy_count; /* 0 => lazy preemptable
1872 #define INIT_THREAD_INFO(tsk) \
1875 + .preempt_lazy_count = 0, \
1878 #define init_stack (init_thread_union.stack)
1881 #include <asm/asm-offsets.h>
1883 +#define GET_THREAD_INFO(reg) \
1884 + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
1885 + _ASM_SUB $(THREAD_SIZE),reg ;
1891 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
1892 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
1893 #define TIF_SECCOMP 8 /* secure computing */
1894 +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
1895 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
1896 #define TIF_UPROBE 12 /* breakpointed or singlestepping */
1897 #define TIF_NOTSC 16 /* TSC is not accessible in userland */
1899 #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
1900 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1901 #define _TIF_SECCOMP (1 << TIF_SECCOMP)
1902 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1903 #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
1904 #define _TIF_UPROBE (1 << TIF_UPROBE)
1905 #define _TIF_NOTSC (1 << TIF_NOTSC)
1907 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
1908 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
1910 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1912 #define STACK_WARN (THREAD_SIZE/8)
1915 diff -Nur linux-4.9.6.orig/arch/x86/include/asm/uv/uv_bau.h linux-4.9.6/arch/x86/include/asm/uv/uv_bau.h
1916 --- linux-4.9.6.orig/arch/x86/include/asm/uv/uv_bau.h 2017-01-26 08:25:24.000000000 +0100
1917 +++ linux-4.9.6/arch/x86/include/asm/uv/uv_bau.h 2017-01-28 13:59:09.899654958 +0100
1919 cycles_t send_message;
1920 cycles_t period_end;
1921 cycles_t period_time;
1922 - spinlock_t uvhub_lock;
1923 - spinlock_t queue_lock;
1924 - spinlock_t disable_lock;
1925 + raw_spinlock_t uvhub_lock;
1926 + raw_spinlock_t queue_lock;
1927 + raw_spinlock_t disable_lock;
1930 int max_concurr_const;
1931 @@ -815,15 +815,15 @@
1932 * to be lowered below the current 'v'. atomic_add_unless can only stop
1935 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
1936 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
1939 + raw_spin_lock(lock);
1940 if (atomic_read(v) >= u) {
1941 - spin_unlock(lock);
1942 + raw_spin_unlock(lock);
1946 - spin_unlock(lock);
1947 + raw_spin_unlock(lock);
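
Restoring the lines the hunk elides, the converted helper reads in full (reconstructed from the mainline 4.9 helper plus the raw_ conversion above):

	static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
	{
		raw_spin_lock(lock);
		if (atomic_read(v) >= u) {
			raw_spin_unlock(lock);
			return 0;
		}
		atomic_inc(v);
		raw_spin_unlock(lock);
		return 1;
	}
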
1951 diff -Nur linux-4.9.6.orig/arch/x86/Kconfig linux-4.9.6/arch/x86/Kconfig
1952 --- linux-4.9.6.orig/arch/x86/Kconfig 2017-01-26 08:25:24.000000000 +0100
1953 +++ linux-4.9.6/arch/x86/Kconfig 2017-01-28 13:59:09.895654805 +0100
1958 + select HAVE_PREEMPT_LAZY
1959 select ACPI_LEGACY_TABLES_LOOKUP if ACPI
1960 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1962 @@ -232,8 +233,11 @@
1964 depends on ISA_DMA_API
1966 +config RWSEM_GENERIC_SPINLOCK
1967 + def_bool PREEMPT_RT_FULL
1969 config RWSEM_XCHGADD_ALGORITHM
1971 + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1973 config GENERIC_CALIBRATE_DELAY
1977 bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1978 depends on X86_64 && SMP && DEBUG_KERNEL
1979 - select CPUMASK_OFFSTACK
1980 + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1982 Enable maximum number of CPUS and NUMA Nodes for this architecture.
1984 diff -Nur linux-4.9.6.orig/arch/x86/kernel/acpi/boot.c linux-4.9.6/arch/x86/kernel/acpi/boot.c
1985 --- linux-4.9.6.orig/arch/x86/kernel/acpi/boot.c 2017-01-26 08:25:24.000000000 +0100
1986 +++ linux-4.9.6/arch/x86/kernel/acpi/boot.c 2017-01-28 13:59:09.899654958 +0100
1991 +#ifdef CONFIG_X86_IO_APIC
1992 static DEFINE_MUTEX(acpi_ioapic_lock);
1995 /* --------------------------------------------------------------------------
1996 Boot-time Configuration
1997 diff -Nur linux-4.9.6.orig/arch/x86/kernel/apic/io_apic.c linux-4.9.6/arch/x86/kernel/apic/io_apic.c
1998 --- linux-4.9.6.orig/arch/x86/kernel/apic/io_apic.c 2017-01-26 08:25:24.000000000 +0100
1999 +++ linux-4.9.6/arch/x86/kernel/apic/io_apic.c 2017-01-28 13:59:09.899654958 +0100
2000 @@ -1712,7 +1712,8 @@
2001 static inline bool ioapic_irqd_mask(struct irq_data *data)
2003 /* If we are moving the irq we need to mask it */
2004 - if (unlikely(irqd_is_setaffinity_pending(data))) {
2005 + if (unlikely(irqd_is_setaffinity_pending(data) &&
2006 + !irqd_irq_inprogress(data))) {
2007 mask_ioapic_irq(data);
2010 diff -Nur linux-4.9.6.orig/arch/x86/kernel/asm-offsets.c linux-4.9.6/arch/x86/kernel/asm-offsets.c
2011 --- linux-4.9.6.orig/arch/x86/kernel/asm-offsets.c 2017-01-26 08:25:24.000000000 +0100
2012 +++ linux-4.9.6/arch/x86/kernel/asm-offsets.c 2017-01-28 13:59:09.899654958 +0100
2016 OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2017 + OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2018 OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2024 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2025 + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2027 diff -Nur linux-4.9.6.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-4.9.6/arch/x86/kernel/cpu/mcheck/mce.c
2028 --- linux-4.9.6.orig/arch/x86/kernel/cpu/mcheck/mce.c 2017-01-26 08:25:24.000000000 +0100
2029 +++ linux-4.9.6/arch/x86/kernel/cpu/mcheck/mce.c 2017-01-28 13:59:09.899654958 +0100
2031 #include <linux/debugfs.h>
2032 #include <linux/irq_work.h>
2033 #include <linux/export.h>
2034 +#include <linux/jiffies.h>
2035 +#include <linux/swork.h>
2036 #include <linux/jump_label.h>
2038 #include <asm/processor.h>
2039 @@ -1317,7 +1319,7 @@
2040 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2042 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2043 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2044 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2046 static unsigned long mce_adjust_timer_default(unsigned long interval)
2048 @@ -1326,32 +1328,18 @@
2050 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2052 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2053 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2055 - unsigned long when = jiffies + interval;
2056 - unsigned long flags;
2058 - local_irq_save(flags);
2060 - if (timer_pending(t)) {
2061 - if (time_before(when, t->expires))
2062 - mod_timer(t, when);
2064 - t->expires = round_jiffies(when);
2065 - add_timer_on(t, smp_processor_id());
2068 - local_irq_restore(flags);
2070 + return HRTIMER_NORESTART;
2071 + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2072 + return HRTIMER_RESTART;
2075 -static void mce_timer_fn(unsigned long data)
2076 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2078 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2079 - int cpu = smp_processor_id();
2082 - WARN_ON(cpu != data);
2084 iv = __this_cpu_read(mce_next_interval);
2086 if (mce_available(this_cpu_ptr(&cpu_info))) {
2087 @@ -1374,7 +1362,7 @@
2090 __this_cpu_write(mce_next_interval, iv);
2091 - __restart_timer(t, iv);
2092 + return __restart_timer(timer, iv);
2096 @@ -1382,7 +1370,7 @@
2098 void mce_timer_kick(unsigned long interval)
2100 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2101 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2102 unsigned long iv = __this_cpu_read(mce_next_interval);
2104 __restart_timer(t, interval);
2105 @@ -1397,7 +1385,7 @@
2108 for_each_online_cpu(cpu)
2109 - del_timer_sync(&per_cpu(mce_timer, cpu));
2110 + hrtimer_cancel(&per_cpu(mce_timer, cpu));
2113 static void mce_do_trigger(struct work_struct *work)
2114 @@ -1407,6 +1395,56 @@
2116 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2118 +static void __mce_notify_work(struct swork_event *event)
2120 + /* Not more than two messages every minute */
2121 + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2123 + /* wake processes polling /dev/mcelog */
2124 + wake_up_interruptible(&mce_chrdev_wait);
2127 + * There is no risk of missing notifications because
2128 + * work_pending is always cleared before the function is
2131 + if (mce_helper[0] && !work_pending(&mce_trigger_work))
2132 + schedule_work(&mce_trigger_work);
2134 + if (__ratelimit(&ratelimit))
2135 + pr_info(HW_ERR "Machine check events logged\n");
2138 +#ifdef CONFIG_PREEMPT_RT_FULL
2139 +static bool notify_work_ready __read_mostly;
2140 +static struct swork_event notify_work;
2142 +static int mce_notify_work_init(void)
2146 + err = swork_get();
2150 + INIT_SWORK(&notify_work, __mce_notify_work);
2151 + notify_work_ready = true;
2155 +static void mce_notify_work(void)
2157 + if (notify_work_ready)
2158 + swork_queue(&notify_work);
2161 +static void mce_notify_work(void)
2163 + __mce_notify_work(NULL);
2165 +static inline int mce_notify_work_init(void) { return 0; }
2169 * Notify the user(s) about new machine check events.
2170 * Can be called from interrupt context, but not from machine check/NMI
2171 @@ -1414,19 +1452,8 @@
2173 int mce_notify_irq(void)
2175 - /* Not more than two messages every minute */
2176 - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2178 if (test_and_clear_bit(0, &mce_need_notify)) {
2179 - /* wake processes polling /dev/mcelog */
2180 - wake_up_interruptible(&mce_chrdev_wait);
2182 - if (mce_helper[0])
2183 - schedule_work(&mce_trigger_work);
2185 - if (__ratelimit(&ratelimit))
2186 - pr_info(HW_ERR "Machine check events logged\n");
2188 + mce_notify_work();
2192 @@ -1732,7 +1759,7 @@
2196 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2197 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2199 unsigned long iv = check_interval * HZ;
2201 @@ -1741,16 +1768,17 @@
2203 per_cpu(mce_next_interval, cpu) = iv;
2205 - t->expires = round_jiffies(jiffies + iv);
2206 - add_timer_on(t, cpu);
2207 + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2208 + 0, HRTIMER_MODE_REL_PINNED);
2211 static void __mcheck_cpu_init_timer(void)
2213 - struct timer_list *t = this_cpu_ptr(&mce_timer);
2214 + struct hrtimer *t = this_cpu_ptr(&mce_timer);
2215 unsigned int cpu = smp_processor_id();
2217 - setup_pinned_timer(t, mce_timer_fn, cpu);
2218 + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2219 + t->function = mce_timer_fn;
2220 mce_start_timer(cpu, t);
2223 @@ -2475,6 +2503,8 @@
2224 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2227 + hrtimer_cancel(this_cpu_ptr(&mce_timer));
2229 if (!(action & CPU_TASKS_FROZEN))
2232 @@ -2497,6 +2527,7 @@
2234 wrmsrl(msr_ops.ctl(i), b->ctl);
2236 + __mcheck_cpu_init_timer();
2239 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2240 @@ -2504,7 +2535,6 @@
2241 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2243 unsigned int cpu = (unsigned long)hcpu;
2244 - struct timer_list *t = &per_cpu(mce_timer, cpu);
2246 switch (action & ~CPU_TASKS_FROZEN) {
2248 @@ -2524,11 +2554,9 @@
2250 case CPU_DOWN_PREPARE:
2251 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2252 - del_timer_sync(t);
2254 case CPU_DOWN_FAILED:
2255 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2256 - mce_start_timer(cpu, t);
2260 @@ -2567,6 +2595,10 @@
2264 + err = mce_notify_work_init();
2268 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
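
Two conversions happen in this file: the per-CPU polling timer becomes an hrtimer (whose callback the RT kernel can keep deterministic, instead of relying on the now-threaded timer softirq), and user notification goes through the swork layer so it never runs in a context where sleeping locks are forbidden. With the elided guard restored, __restart_timer() plausibly reads (a reconstruction, assuming the usual zero-interval check):

	static enum hrtimer_restart __restart_timer(struct hrtimer *timer,
						    unsigned long interval)
	{
		if (!interval)
			return HRTIMER_NORESTART;
		hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
		return HRTIMER_RESTART;
	}
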
2271 diff -Nur linux-4.9.6.orig/arch/x86/kernel/irq_32.c linux-4.9.6/arch/x86/kernel/irq_32.c
2272 --- linux-4.9.6.orig/arch/x86/kernel/irq_32.c 2017-01-26 08:25:24.000000000 +0100
2273 +++ linux-4.9.6/arch/x86/kernel/irq_32.c 2017-01-28 13:59:09.899654958 +0100
2275 cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
2278 +#ifndef CONFIG_PREEMPT_RT_FULL
2279 void do_softirq_own_stack(void)
2281 struct irq_stack *irqstk;
2284 call_on_stack(__do_softirq, isp);
2288 bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2290 diff -Nur linux-4.9.6.orig/arch/x86/kernel/process_32.c linux-4.9.6/arch/x86/kernel/process_32.c
2291 --- linux-4.9.6.orig/arch/x86/kernel/process_32.c 2017-01-26 08:25:24.000000000 +0100
2292 +++ linux-4.9.6/arch/x86/kernel/process_32.c 2017-01-28 13:59:09.899654958 +0100
2294 #include <linux/uaccess.h>
2295 #include <linux/io.h>
2296 #include <linux/kdebug.h>
2297 +#include <linux/highmem.h>
2299 #include <asm/pgtable.h>
2300 #include <asm/ldt.h>
2301 @@ -195,6 +196,35 @@
2303 EXPORT_SYMBOL_GPL(start_thread);
2305 +#ifdef CONFIG_PREEMPT_RT_FULL
2306 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2311 + * Clear @prev's kmap_atomic mappings
2313 + for (i = 0; i < prev_p->kmap_idx; i++) {
2314 + int idx = i + KM_TYPE_NR * smp_processor_id();
2315 + pte_t *ptep = kmap_pte - idx;
2317 + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2320 + * Restore @next_p's kmap_atomic mappings
2322 + for (i = 0; i < next_p->kmap_idx; i++) {
2323 + int idx = i + KM_TYPE_NR * smp_processor_id();
2325 + if (!pte_none(next_p->kmap_pte[i]))
2326 + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2331 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2336 * switch_to(x,y) should switch tasks from x to y.
2338 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2339 __switch_to_xtra(prev_p, next_p, tss);
2341 + switch_kmaps(prev_p, next_p);
2344 * Leave lazy mode, flushing any hypercalls made here.
2345 * This must be done before restoring TLS segments so
2346 diff -Nur linux-4.9.6.orig/arch/x86/kvm/lapic.c linux-4.9.6/arch/x86/kvm/lapic.c
2347 --- linux-4.9.6.orig/arch/x86/kvm/lapic.c 2017-01-26 08:25:24.000000000 +0100
2348 +++ linux-4.9.6/arch/x86/kvm/lapic.c 2017-01-28 13:59:09.899654958 +0100
2349 @@ -1939,6 +1939,7 @@
2350 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2351 HRTIMER_MODE_ABS_PINNED);
2352 apic->lapic_timer.timer.function = apic_timer_fn;
2353 + apic->lapic_timer.timer.irqsafe = 1;
2356 * APIC is created enabled. This will prevent kvm_lapic_set_base from
2357 diff -Nur linux-4.9.6.orig/arch/x86/kvm/x86.c linux-4.9.6/arch/x86/kvm/x86.c
2358 --- linux-4.9.6.orig/arch/x86/kvm/x86.c 2017-01-26 08:25:24.000000000 +0100
2359 +++ linux-4.9.6/arch/x86/kvm/x86.c 2017-01-28 13:59:09.903655111 +0100
2360 @@ -5932,6 +5932,13 @@
2364 +#ifdef CONFIG_PREEMPT_RT_FULL
2365 + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2366 + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2367 + return -EOPNOTSUPP;
2371 r = kvm_mmu_module_init();
2373 goto out_free_percpu;
2374 diff -Nur linux-4.9.6.orig/arch/x86/mm/highmem_32.c linux-4.9.6/arch/x86/mm/highmem_32.c
2375 --- linux-4.9.6.orig/arch/x86/mm/highmem_32.c 2017-01-26 08:25:24.000000000 +0100
2376 +++ linux-4.9.6/arch/x86/mm/highmem_32.c 2017-01-28 13:59:09.903655111 +0100
2379 void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2381 + pte_t pte = mk_pte(page, prot);
2382 unsigned long vaddr;
2385 - preempt_disable();
2386 + preempt_disable_nort();
2387 pagefault_disable();
2389 if (!PageHighMem(page))
2391 idx = type + KM_TYPE_NR*smp_processor_id();
2392 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2393 BUG_ON(!pte_none(*(kmap_pte-idx)));
2394 - set_pte(kmap_pte-idx, mk_pte(page, prot));
2395 +#ifdef CONFIG_PREEMPT_RT_FULL
2396 + current->kmap_pte[type] = pte;
2398 + set_pte(kmap_pte-idx, pte);
2399 arch_flush_lazy_mmu_mode();
2401 return (void *)vaddr;
2403 * is a bad idea also, in case the page changes cacheability
2404 * attributes or becomes a protected page in a hypervisor.
2406 +#ifdef CONFIG_PREEMPT_RT_FULL
2407 + current->kmap_pte[type] = __pte(0);
2409 kpte_clear_flush(kmap_pte-idx, vaddr);
2410 kmap_atomic_idx_pop();
2411 arch_flush_lazy_mmu_mode();
2417 + preempt_enable_nort();
2419 EXPORT_SYMBOL(__kunmap_atomic);
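
Taken together with the switch_kmaps() hunk in process_32.c above, the effect is that an atomic kmap section is no longer an implicit preempt-off section on RT: every active mapping's pte is remembered in current->kmap_pte[], and switch_kmaps() replays it into the incoming CPU's fixmap slots at context switch. From the caller's point of view (a sketch; page, buf and len are placeholders):

	void *vaddr = kmap_atomic(page); /* RT: pte also saved in current->kmap_pte[] */
	memcpy(vaddr, buf, len);         /* task may be preempted here on RT */
	kunmap_atomic(vaddr);            /* mapping was replayed by switch_kmaps() */
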
2421 diff -Nur linux-4.9.6.orig/arch/x86/mm/iomap_32.c linux-4.9.6/arch/x86/mm/iomap_32.c
2422 --- linux-4.9.6.orig/arch/x86/mm/iomap_32.c 2017-01-26 08:25:24.000000000 +0100
2423 +++ linux-4.9.6/arch/x86/mm/iomap_32.c 2017-01-28 13:59:09.903655111 +0100
2426 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2428 + pte_t pte = pfn_pte(pfn, prot);
2429 unsigned long vaddr;
2433 type = kmap_atomic_idx_push();
2434 idx = type + KM_TYPE_NR * smp_processor_id();
2435 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2436 - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2437 + WARN_ON(!pte_none(*(kmap_pte - idx)));
2439 +#ifdef CONFIG_PREEMPT_RT_FULL
2440 + current->kmap_pte[type] = pte;
2442 + set_pte(kmap_pte - idx, pte);
2443 arch_flush_lazy_mmu_mode();
2445 return (void *)vaddr;
2447 * is a bad idea also, in case the page changes cacheability
2448 * attributes or becomes a protected page in a hypervisor.
2450 +#ifdef CONFIG_PREEMPT_RT_FULL
2451 + current->kmap_pte[type] = __pte(0);
2453 kpte_clear_flush(kmap_pte-idx, vaddr);
2454 kmap_atomic_idx_pop();
2456 diff -Nur linux-4.9.6.orig/arch/x86/platform/uv/tlb_uv.c linux-4.9.6/arch/x86/platform/uv/tlb_uv.c
2457 --- linux-4.9.6.orig/arch/x86/platform/uv/tlb_uv.c 2017-01-26 08:25:24.000000000 +0100
2458 +++ linux-4.9.6/arch/x86/platform/uv/tlb_uv.c 2017-01-28 13:59:09.903655111 +0100
2461 quiesce_local_uvhub(hmaster);
2463 - spin_lock(&hmaster->queue_lock);
2464 + raw_spin_lock(&hmaster->queue_lock);
2465 reset_with_ipi(&bau_desc->distribution, bcp);
2466 - spin_unlock(&hmaster->queue_lock);
2467 + raw_spin_unlock(&hmaster->queue_lock);
2469 end_uvhub_quiesce(hmaster);
2473 quiesce_local_uvhub(hmaster);
2475 - spin_lock(&hmaster->queue_lock);
2476 + raw_spin_lock(&hmaster->queue_lock);
2477 reset_with_ipi(&bau_desc->distribution, bcp);
2478 - spin_unlock(&hmaster->queue_lock);
2479 + raw_spin_unlock(&hmaster->queue_lock);
2481 end_uvhub_quiesce(hmaster);
2486 hmaster = bcp->uvhub_master;
2487 - spin_lock(&hmaster->disable_lock);
2488 + raw_spin_lock(&hmaster->disable_lock);
2489 if (!bcp->baudisabled) {
2490 stat->s_bau_disabled++;
2496 - spin_unlock(&hmaster->disable_lock);
2497 + raw_spin_unlock(&hmaster->disable_lock);
2500 static void count_max_concurr(int stat, struct bau_control *bcp,
2503 static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2505 - spinlock_t *lock = &hmaster->uvhub_lock;
2506 + raw_spinlock_t *lock = &hmaster->uvhub_lock;
2509 v = &hmaster->active_descriptor_count;
2510 @@ -1002,7 +1002,7 @@
2511 struct bau_control *hmaster;
2513 hmaster = bcp->uvhub_master;
2514 - spin_lock(&hmaster->disable_lock);
2515 + raw_spin_lock(&hmaster->disable_lock);
2516 if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2517 stat->s_bau_reenabled++;
2518 for_each_present_cpu(tcpu) {
2519 @@ -1014,10 +1014,10 @@
2520 tbcp->period_giveups = 0;
2523 - spin_unlock(&hmaster->disable_lock);
2524 + raw_spin_unlock(&hmaster->disable_lock);
2527 - spin_unlock(&hmaster->disable_lock);
2528 + raw_spin_unlock(&hmaster->disable_lock);
2532 @@ -1940,9 +1940,9 @@
2533 bcp->cong_reps = congested_reps;
2534 bcp->disabled_period = sec_2_cycles(disabled_period);
2535 bcp->giveup_limit = giveup_limit;
2536 - spin_lock_init(&bcp->queue_lock);
2537 - spin_lock_init(&bcp->uvhub_lock);
2538 - spin_lock_init(&bcp->disable_lock);
2539 + raw_spin_lock_init(&bcp->queue_lock);
2540 + raw_spin_lock_init(&bcp->uvhub_lock);
2541 + raw_spin_lock_init(&bcp->disable_lock);
2545 diff -Nur linux-4.9.6.orig/arch/x86/platform/uv/uv_time.c linux-4.9.6/arch/x86/platform/uv/uv_time.c
2546 --- linux-4.9.6.orig/arch/x86/platform/uv/uv_time.c 2017-01-26 08:25:24.000000000 +0100
2547 +++ linux-4.9.6/arch/x86/platform/uv/uv_time.c 2017-01-28 13:59:09.903655111 +0100
2550 /* There is one of these allocated per node */
2551 struct uv_rtc_timer_head {
2553 + raw_spinlock_t lock;
2554 /* next cpu waiting for timer, local node relative: */
2556 /* number of cpus on this node: */
2558 uv_rtc_deallocate_timers();
2561 - spin_lock_init(&head->lock);
2562 + raw_spin_lock_init(&head->lock);
2563 head->ncpus = uv_blade_nr_possible_cpus(bid);
2564 head->next_cpu = -1;
2565 blade_info[bid] = head;
2567 unsigned long flags;
2570 - spin_lock_irqsave(&head->lock, flags);
2571 + raw_spin_lock_irqsave(&head->lock, flags);
2573 next_cpu = head->next_cpu;
2575 @@ -243,12 +243,12 @@
2576 if (uv_setup_intr(cpu, expires)) {
2578 uv_rtc_find_next_timer(head, pnode);
2579 - spin_unlock_irqrestore(&head->lock, flags);
2580 + raw_spin_unlock_irqrestore(&head->lock, flags);
2585 - spin_unlock_irqrestore(&head->lock, flags);
2586 + raw_spin_unlock_irqrestore(&head->lock, flags);
2591 unsigned long flags;
2594 - spin_lock_irqsave(&head->lock, flags);
2595 + raw_spin_lock_irqsave(&head->lock, flags);
2597 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2600 uv_rtc_find_next_timer(head, pnode);
2603 - spin_unlock_irqrestore(&head->lock, flags);
2604 + raw_spin_unlock_irqrestore(&head->lock, flags);
2608 @@ -299,13 +299,18 @@
2609 static cycle_t uv_read_rtc(struct clocksource *cs)
2611 unsigned long offset;
2614 + preempt_disable();
2615 if (uv_get_min_hub_revision_id() == 1)
2618 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2620 - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2621 + cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
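
With the elided lines restored, the modified uv_read_rtc() plausibly becomes the following (a reconstruction): the preempt_disable()/preempt_enable() pair pins the task so that uv_blade_processor_id() and the MMR read are guaranteed to happen on the same CPU.

	static cycle_t uv_read_rtc(struct clocksource *cs)
	{
		unsigned long offset;
		cycle_t cycles;

		preempt_disable();
		if (uv_get_min_hub_revision_id() == 1)
			offset = 0;
		else
			offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;

		cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
		preempt_enable();

		return cycles;
	}
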
2628 diff -Nur linux-4.9.6.orig/block/blk-core.c linux-4.9.6/block/blk-core.c
2629 --- linux-4.9.6.orig/block/blk-core.c 2017-01-26 08:25:24.000000000 +0100
2630 +++ linux-4.9.6/block/blk-core.c 2017-01-28 13:59:09.903655111 +0100
2633 INIT_LIST_HEAD(&rq->queuelist);
2634 INIT_LIST_HEAD(&rq->timeout_list);
2635 +#ifdef CONFIG_PREEMPT_RT_FULL
2636 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2640 rq->__sector = (sector_t) -1;
2643 void blk_start_queue(struct request_queue *q)
2645 - WARN_ON(!irqs_disabled());
2646 + WARN_ON_NONRT(!irqs_disabled());
2648 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2654 - ret = wait_event_interruptible(q->mq_freeze_wq,
2655 + ret = swait_event_interruptible(q->mq_freeze_wq,
2656 !atomic_read(&q->mq_freeze_depth) ||
2657 blk_queue_dying(q));
2658 if (blk_queue_dying(q))
2660 struct request_queue *q =
2661 container_of(ref, struct request_queue, q_usage_counter);
2663 - wake_up_all(&q->mq_freeze_wq);
2664 + swake_up_all(&q->mq_freeze_wq);
2667 static void blk_rq_timed_out_timer(unsigned long data)
2669 q->bypass_depth = 1;
2670 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2672 - init_waitqueue_head(&q->mq_freeze_wq);
2673 + init_swait_queue_head(&q->mq_freeze_wq);
2676 * Init percpu_ref in atomic mode so that it's faster to shutdown.
2677 @@ -3177,7 +3180,7 @@
2678 blk_run_queue_async(q);
2681 - spin_unlock(q->queue_lock);
2682 + spin_unlock_irq(q->queue_lock);
2685 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2686 @@ -3225,7 +3228,6 @@
2687 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2689 struct request_queue *q;
2690 - unsigned long flags;
2694 @@ -3245,11 +3247,6 @@
2699 - * Save and disable interrupts here, to avoid doing it for every
2700 - * queue lock we have to take.
2702 - local_irq_save(flags);
2703 while (!list_empty(&list)) {
2704 rq = list_entry_rq(list.next);
2705 list_del_init(&rq->queuelist);
2706 @@ -3262,7 +3259,7 @@
2707 queue_unplugged(q, depth, from_schedule);
2710 - spin_lock(q->queue_lock);
2711 + spin_lock_irq(q->queue_lock);
2715 @@ -3289,8 +3286,6 @@
2718 queue_unplugged(q, depth, from_schedule);
2720 - local_irq_restore(flags);
2723 void blk_finish_plug(struct blk_plug *plug)
2724 diff -Nur linux-4.9.6.orig/block/blk-ioc.c linux-4.9.6/block/blk-ioc.c
2725 --- linux-4.9.6.orig/block/blk-ioc.c 2017-01-26 08:25:24.000000000 +0100
2726 +++ linux-4.9.6/block/blk-ioc.c 2017-01-28 13:59:09.903655111 +0100
2728 #include <linux/bio.h>
2729 #include <linux/blkdev.h>
2730 #include <linux/slab.h>
2731 +#include <linux/delay.h>
2736 spin_unlock(q->queue_lock);
2738 spin_unlock_irqrestore(&ioc->lock, flags);
2741 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
2745 spin_unlock(icq->q->queue_lock);
2747 spin_unlock_irqrestore(&ioc->lock, flags);
2753 diff -Nur linux-4.9.6.orig/block/blk-mq.c linux-4.9.6/block/blk-mq.c
2754 --- linux-4.9.6.orig/block/blk-mq.c 2017-01-26 08:25:24.000000000 +0100
2755 +++ linux-4.9.6/block/blk-mq.c 2017-01-28 13:59:09.903655111 +0100
2758 static void blk_mq_freeze_queue_wait(struct request_queue *q)
2760 - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2761 + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
2766 WARN_ON_ONCE(freeze_depth < 0);
2767 if (!freeze_depth) {
2768 percpu_ref_reinit(&q->q_usage_counter);
2769 - wake_up_all(&q->mq_freeze_wq);
2770 + swake_up_all(&q->mq_freeze_wq);
2773 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
2775 * dying, we need to ensure that processes currently waiting on
2776 * the queue are notified as well.
2778 - wake_up_all(&q->mq_freeze_wq);
2779 + swake_up_all(&q->mq_freeze_wq);
2782 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
2787 +#ifdef CONFIG_PREEMPT_RT_FULL
2788 + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2790 INIT_LIST_HEAD(&rq->timeout_list);
2793 @@ -345,6 +348,17 @@
2795 EXPORT_SYMBOL(blk_mq_end_request);
2797 +#ifdef CONFIG_PREEMPT_RT_FULL
2799 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
2801 + struct request *rq = container_of(work, struct request, work);
2803 + rq->q->softirq_done_fn(rq);
2808 static void __blk_mq_complete_request_remote(void *data)
2810 struct request *rq = data;
2812 rq->q->softirq_done_fn(rq);
2817 static void blk_mq_ipi_complete_request(struct request *rq)
2819 struct blk_mq_ctx *ctx = rq->mq_ctx;
2820 @@ -363,19 +379,23 @@
2825 + cpu = get_cpu_light();
2826 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
2827 shared = cpus_share_cache(cpu, ctx->cpu);
2829 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
2830 +#ifdef CONFIG_PREEMPT_RT_FULL
2831 + schedule_work_on(ctx->cpu, &rq->work);
2833 rq->csd.func = __blk_mq_complete_request_remote;
2836 smp_call_function_single_async(ctx->cpu, &rq->csd);
2839 rq->q->softirq_done_fn(rq);
2845 static void __blk_mq_complete_request(struct request *rq)
2846 @@ -915,14 +935,14 @@
2849 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
2850 - int cpu = get_cpu();
2851 + int cpu = get_cpu_light();
2852 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
2853 __blk_mq_run_hw_queue(hctx);
2863 kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
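
On RT the cross-CPU completion IPI is avoided: smp_call_function_single_async() runs its callback in hard-irq context, where softirq_done_fn() may take sleeping locks, so a work item is queued on the target CPU instead. Likewise get_cpu()/put_cpu() become get_cpu_light()/put_cpu_light(), which on RT only pin the task to its CPU rather than disabling preemption. Reassembled against mainline 4.9 (the csd setup lines are restored, not shown in the hunk), the dispatch decision becomes:

	static void blk_mq_ipi_complete_request(struct request *rq)
	{
		struct blk_mq_ctx *ctx = rq->mq_ctx;
		bool shared = false;
		int cpu;

		cpu = get_cpu_light();
		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
			shared = cpus_share_cache(cpu, ctx->cpu);

		if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
	#ifdef CONFIG_PREEMPT_RT_FULL
			/* runs __blk_mq_complete_request_remote_work() on ctx->cpu */
			schedule_work_on(ctx->cpu, &rq->work);
	#else
			rq->csd.func = __blk_mq_complete_request_remote;
			rq->csd.info = rq;
			rq->csd.flags = 0;
			smp_call_function_single_async(ctx->cpu, &rq->csd);
	#endif
		} else {
			rq->q->softirq_done_fn(rq);
		}
		put_cpu_light();
	}
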
2864 diff -Nur linux-4.9.6.orig/block/blk-mq.h linux-4.9.6/block/blk-mq.h
2865 --- linux-4.9.6.orig/block/blk-mq.h 2017-01-26 08:25:24.000000000 +0100
2866 +++ linux-4.9.6/block/blk-mq.h 2017-01-28 13:59:09.903655111 +0100
2869 static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
2871 - return __blk_mq_get_ctx(q, get_cpu());
2872 + return __blk_mq_get_ctx(q, get_cpu_light());
2875 static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
2881 struct blk_mq_alloc_data {
2882 diff -Nur linux-4.9.6.orig/block/blk-softirq.c linux-4.9.6/block/blk-softirq.c
2883 --- linux-4.9.6.orig/block/blk-softirq.c 2017-01-26 08:25:24.000000000 +0100
2884 +++ linux-4.9.6/block/blk-softirq.c 2017-01-28 13:59:09.903655111 +0100
2886 raise_softirq_irqoff(BLOCK_SOFTIRQ);
2888 local_irq_restore(flags);
2889 + preempt_check_resched_rt();
2894 this_cpu_ptr(&blk_cpu_done));
2895 raise_softirq_irqoff(BLOCK_SOFTIRQ);
2897 + preempt_check_resched_rt();
2904 local_irq_restore(flags);
2905 + preempt_check_resched_rt();
2909 diff -Nur linux-4.9.6.orig/block/bounce.c linux-4.9.6/block/bounce.c
2910 --- linux-4.9.6.orig/block/bounce.c 2017-01-26 08:25:24.000000000 +0100
2911 +++ linux-4.9.6/block/bounce.c 2017-01-28 13:59:09.903655111 +0100
2913 unsigned long flags;
2916 - local_irq_save(flags);
2917 + local_irq_save_nort(flags);
2918 vto = kmap_atomic(to->bv_page);
2919 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
2921 - local_irq_restore(flags);
2922 + local_irq_restore_nort(flags);
2925 #else /* CONFIG_HIGHMEM */
2926 diff -Nur linux-4.9.6.orig/crypto/algapi.c linux-4.9.6/crypto/algapi.c
2927 --- linux-4.9.6.orig/crypto/algapi.c 2017-01-26 08:25:24.000000000 +0100
2928 +++ linux-4.9.6/crypto/algapi.c 2017-01-28 13:59:09.903655111 +0100
2929 @@ -718,13 +718,13 @@
2931 int crypto_register_notifier(struct notifier_block *nb)
2933 - return blocking_notifier_chain_register(&crypto_chain, nb);
2934 + return srcu_notifier_chain_register(&crypto_chain, nb);
2936 EXPORT_SYMBOL_GPL(crypto_register_notifier);
2938 int crypto_unregister_notifier(struct notifier_block *nb)
2940 - return blocking_notifier_chain_unregister(&crypto_chain, nb);
2941 + return srcu_notifier_chain_unregister(&crypto_chain, nb);
2943 EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
2945 diff -Nur linux-4.9.6.orig/crypto/api.c linux-4.9.6/crypto/api.c
2946 --- linux-4.9.6.orig/crypto/api.c 2017-01-26 08:25:24.000000000 +0100
2947 +++ linux-4.9.6/crypto/api.c 2017-01-28 13:59:09.903655111 +0100
2949 DECLARE_RWSEM(crypto_alg_sem);
2950 EXPORT_SYMBOL_GPL(crypto_alg_sem);
2952 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
2953 +SRCU_NOTIFIER_HEAD(crypto_chain);
2954 EXPORT_SYMBOL_GPL(crypto_chain);
2956 static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
2957 @@ -236,10 +236,10 @@
2961 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
2962 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
2963 if (ok == NOTIFY_DONE) {
2964 request_module("cryptomgr");
2965 - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
2966 + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
2970 diff -Nur linux-4.9.6.orig/crypto/internal.h linux-4.9.6/crypto/internal.h
2971 --- linux-4.9.6.orig/crypto/internal.h 2017-01-26 08:25:24.000000000 +0100
2972 +++ linux-4.9.6/crypto/internal.h 2017-01-28 13:59:09.903655111 +0100
2975 extern struct list_head crypto_alg_list;
2976 extern struct rw_semaphore crypto_alg_sem;
2977 -extern struct blocking_notifier_head crypto_chain;
2978 +extern struct srcu_notifier_head crypto_chain;
2980 #ifdef CONFIG_PROC_FS
2981 void __init crypto_init_proc(void);
2984 static inline void crypto_notify(unsigned long val, void *v)
2986 - blocking_notifier_call_chain(&crypto_chain, val, v);
2987 + srcu_notifier_call_chain(&crypto_chain, val, v);
2990 #endif /* _CRYPTO_INTERNAL_H */
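
For callers nothing changes: the chain head keeps its name and the register/unregister wrappers keep their signatures. A hypothetical consumer (my_cb and my_nb are made-up names) still looks like:

	static int my_cb(struct notifier_block *nb, unsigned long val, void *v)
	{
		/* react to crypto notification events */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_nb = {
		.notifier_call = my_cb,
	};

	/* in init code: */
	crypto_register_notifier(&my_nb);

The gain is on the call side: srcu_notifier_call_chain() only takes an SRCU read lock, so invoking the chain no longer contends on the blocking chain's rw_semaphore.
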
2991 diff -Nur linux-4.9.6.orig/Documentation/sysrq.txt linux-4.9.6/Documentation/sysrq.txt
2992 --- linux-4.9.6.orig/Documentation/sysrq.txt 2017-01-26 08:25:24.000000000 +0100
2993 +++ linux-4.9.6/Documentation/sysrq.txt 2017-01-28 13:59:09.887654498 +0100
2995 On other - If you know of the key combos for other architectures, please
2996 let me know so I can add them to this section.
2998 -On all - write a character to /proc/sysrq-trigger. e.g.:
3000 +On all - write a character to /proc/sysrq-trigger, e.g.:
3001 echo t > /proc/sysrq-trigger
3003 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
3004 + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
3005 + Send an ICMP echo request with this pattern plus the particular
3006 + SysRq command key. Example:
3007 + # ping -c1 -s57 -p0102030468
3008 + will trigger the SysRq-H (help) command.
3011 * What are the 'command' keys?
3012 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3013 'b' - Will immediately reboot the system without syncing or unmounting
3014 diff -Nur linux-4.9.6.orig/Documentation/trace/histograms.txt linux-4.9.6/Documentation/trace/histograms.txt
3015 --- linux-4.9.6.orig/Documentation/trace/histograms.txt 1970-01-01 01:00:00.000000000 +0100
3016 +++ linux-4.9.6/Documentation/trace/histograms.txt 2017-01-28 13:59:09.887654498 +0100
3018 + Using the Linux Kernel Latency Histograms
3021 +This document gives a short explanation of how to enable, configure and use
3022 +latency histograms. Latency histograms are primarily relevant in the
3023 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
3024 +and are used in the quality management of the Linux real-time patches.
3028 +* Purpose of latency histograms
3030 +A latency histogram continuously accumulates the frequencies of latency
3031 +data. There are two types of histograms:
3032 +- potential sources of latencies
3033 +- effective latencies
3036 +* Potential sources of latencies
3038 +Potential sources of latencies are code segments where interrupts,
3039 +preemption or both are disabled (aka critical sections). To create
3040 +histograms of potential sources of latency, the kernel stores the time
3041 +stamp at the start of a critical section, determines the time elapsed
3042 +when the end of the section is reached, and increments the frequency
3043 +counter of that latency value - irrespective of whether any concurrently
3044 +running process is affected by latency or not.
3045 +- Configuration items (in the Kernel hacking/Tracers submenu)
3046 + CONFIG_INTERRUPT_OFF_LATENCY
3047 + CONFIG_PREEMPT_OFF_LATENCY
3050 +* Effective latencies
3052 +Effective latencies actually occur during the wakeup of a process. To
3053 +determine effective latencies, the kernel stores the time stamp when a
3054 +process is scheduled to be woken up, and determines the duration of the
3055 +wakeup time shortly before control is passed over to this process. Note
3056 +that the apparent latency in user space may be somewhat longer, since the
3057 +process may be interrupted after control is passed over to it but before
3058 +the execution in user space takes place. Simply measuring the interval
3059 +between enqueuing and wakeup may also not be appropriate in cases when a
3060 +process is scheduled as a result of a timer expiration. The timer may have
3061 +missed its deadline, e.g. due to disabled interrupts, but this latency
3062 +would not be registered. Therefore, the offsets of missed timers are
3063 +recorded in a separate histogram. If both wakeup latency and missed timer
3064 +offsets are configured and enabled, a third histogram may be enabled that
3065 +records the overall latency as a sum of the timer latency, if any, and the
3066 +wakeup latency. This histogram is called "timerandwakeup".
3067 +- Configuration items (in the Kernel hacking/Tracers submenu)
3068 + CONFIG_WAKEUP_LATENCY
3069 + CONFIG_MISSED_TIMER_OFFSETS
3074 +The interface to the administration of the latency histograms is located
3075 +in the debugfs file system. To mount it, either enter
3077 +mount -t sysfs nodev /sys
3078 +mount -t debugfs nodev /sys/kernel/debug
3080 +from shell command line level, or add
3082 +nodev /sys sysfs defaults 0 0
3083 +nodev /sys/kernel/debug debugfs defaults 0 0
3085 +to the file /etc/fstab. All latency histogram related files are then
3086 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
3087 +particular histogram type is enabled by writing non-zero to the related
3088 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
3089 +Select "preemptirqsoff" for the histograms of potential sources of
3090 +latencies and "wakeup" for histograms of effective latencies etc. The
3091 +histogram data - one per CPU - are available in the files
3093 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
3094 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
3095 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
3096 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
3097 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
3098 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
3099 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
3101 +The histograms are reset by writing non-zero to the file "reset" in a
3102 +particular latency directory. To reset all latency data, use
3106 +TRACINGDIR=/sys/kernel/debug/tracing
3107 +HISTDIR=$TRACINGDIR/latency_hist
3109 +if test -d $HISTDIR
3112 + for i in `find . | grep /reset$`
3121 +Latency data are stored with a resolution of one microsecond. The
3122 +maximum latency is 10,240 microseconds. The data are only valid if the
3123 +overflow register is empty. Every output line contains the latency in
3124 +microseconds in the first column and the number of samples in the second
3125 +column. To display only lines with a positive latency count, use, for example:
3128 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
3130 +#Minimum latency: 0 microseconds.
3131 +#Average latency: 0 microseconds.
3132 +#Maximum latency: 25 microseconds.
3133 +#Total samples: 3104770694
3134 +#There are 0 samples greater or equal than 10240 microseconds
3163 +* Wakeup latency of a selected process
3165 +To only collect wakeup latency data of a particular process, write the
3166 +PID of the requested process to
3168 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
3170 +PIDs are not considered if this variable is set to 0.
3173 +* Details of the process with the highest wakeup latency so far
3175 +Selected data of the process that suffered from the highest wakeup
3176 +latency that occurred on a particular CPU are available in the file
3178 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
3180 +In addition, other relevant system data at the time when the
3181 +latency occurred are given.
3183 +The format of the data is (all in one line):
3184 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
3185 +<- <PID> <Priority> <Command> <Timestamp>
3187 +The value of <Timeroffset> is only relevant in the combined timer
3188 +and wakeup latency recording. In the wakeup recording, it is
3189 +always 0; in the missed_timer_offsets recording, it is the same
3192 +When retrospectively searching for the origin of a latency while
3193 +tracing was not enabled, it may be helpful to know the name and
3194 +some basic data of the task that (finally) was switching to the
3195 +late real-time task. In addition to the victim's data, the
3196 +data of the possible culprit are therefore also displayed after the
3199 +Finally, the timestamp of the time when the latency occurred
3200 +in <seconds>.<microseconds> after the most recent system boot is provided.
3203 +These data are also reset when the wakeup histogram is reset.
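
For illustration, a max_latency-CPUx line in the format above might look like this (all values made up; shown with the same line-continuation convention):

1234 -5 78 (0) cyclictest \
<- 2104 120 kworker/3:1 1234.567890
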
3204 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/acglobal.h linux-4.9.6/drivers/acpi/acpica/acglobal.h
3205 --- linux-4.9.6.orig/drivers/acpi/acpica/acglobal.h 2017-01-26 08:25:24.000000000 +0100
3206 +++ linux-4.9.6/drivers/acpi/acpica/acglobal.h 2017-01-28 13:59:09.903655111 +0100
3210 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3211 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
3212 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
3213 ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3215 /* Mutex for _OSI support */
3216 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/hwregs.c linux-4.9.6/drivers/acpi/acpica/hwregs.c
3217 --- linux-4.9.6.orig/drivers/acpi/acpica/hwregs.c 2017-01-26 08:25:24.000000000 +0100
3218 +++ linux-4.9.6/drivers/acpi/acpica/hwregs.c 2017-01-28 13:59:09.903655111 +0100
3219 @@ -363,14 +363,14 @@
3220 ACPI_BITMASK_ALL_FIXED_STATUS,
3221 ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3223 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3224 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3226 /* Clear the fixed events in PM1 A/B */
3228 status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3229 ACPI_BITMASK_ALL_FIXED_STATUS);
3231 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3232 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3234 if (ACPI_FAILURE(status)) {
3236 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/hwxface.c linux-4.9.6/drivers/acpi/acpica/hwxface.c
3237 --- linux-4.9.6.orig/drivers/acpi/acpica/hwxface.c 2017-01-26 08:25:24.000000000 +0100
3238 +++ linux-4.9.6/drivers/acpi/acpica/hwxface.c 2017-01-28 13:59:09.903655111 +0100
3240 return_ACPI_STATUS(AE_BAD_PARAMETER);
3243 - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3244 + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3247 * At this point, we know that the parent register is one of the
3252 - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3253 + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3254 return_ACPI_STATUS(status);
3257 diff -Nur linux-4.9.6.orig/drivers/acpi/acpica/utmutex.c linux-4.9.6/drivers/acpi/acpica/utmutex.c
3258 --- linux-4.9.6.orig/drivers/acpi/acpica/utmutex.c 2017-01-26 08:25:24.000000000 +0100
3259 +++ linux-4.9.6/drivers/acpi/acpica/utmutex.c 2017-01-28 13:59:09.903655111 +0100
3261 return_ACPI_STATUS (status);
3264 - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3265 + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3266 if (ACPI_FAILURE (status)) {
3267 return_ACPI_STATUS (status);
3270 /* Delete the spinlocks */
3272 acpi_os_delete_lock(acpi_gbl_gpe_lock);
3273 - acpi_os_delete_lock(acpi_gbl_hardware_lock);
3274 + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3275 acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3277 /* Delete the reader/writer lock */
3278 diff -Nur linux-4.9.6.orig/drivers/ata/libata-sff.c linux-4.9.6/drivers/ata/libata-sff.c
3279 --- linux-4.9.6.orig/drivers/ata/libata-sff.c 2017-01-26 08:25:24.000000000 +0100
3280 +++ linux-4.9.6/drivers/ata/libata-sff.c 2017-01-28 13:59:09.907655265 +0100
3282 unsigned long flags;
3283 unsigned int consumed;
3285 - local_irq_save(flags);
3286 + local_irq_save_nort(flags);
3287 consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3288 - local_irq_restore(flags);
3289 + local_irq_restore_nort(flags);
3294 unsigned long flags;
3296 /* FIXME: use a bounce buffer */
3297 - local_irq_save(flags);
3298 + local_irq_save_nort(flags);
3299 buf = kmap_atomic(page);
3301 /* do the actual data transfer */
3306 - local_irq_restore(flags);
3307 + local_irq_restore_nort(flags);
3309 buf = page_address(page);
3310 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3312 unsigned long flags;
3314 /* FIXME: use bounce buffer */
3315 - local_irq_save(flags);
3316 + local_irq_save_nort(flags);
3317 buf = kmap_atomic(page);
3319 /* do the actual data transfer */
3324 - local_irq_restore(flags);
3325 + local_irq_restore_nort(flags);
3327 buf = page_address(page);
3328 consumed = ap->ops->sff_data_xfer(dev, buf + offset,
3329 diff -Nur linux-4.9.6.orig/drivers/block/zram/zcomp.c linux-4.9.6/drivers/block/zram/zcomp.c
3330 --- linux-4.9.6.orig/drivers/block/zram/zcomp.c 2017-01-26 08:25:24.000000000 +0100
3331 +++ linux-4.9.6/drivers/block/zram/zcomp.c 2017-01-28 13:59:09.907655265 +0100
3332 @@ -118,12 +118,19 @@
3334 struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3336 - return *get_cpu_ptr(comp->stream);
3337 + struct zcomp_strm *zstrm;
3339 + zstrm = *this_cpu_ptr(comp->stream);
3340 + spin_lock(&zstrm->zcomp_lock);
3344 void zcomp_stream_put(struct zcomp *comp)
3346 - put_cpu_ptr(comp->stream);
3347 + struct zcomp_strm *zstrm;
3349 + zstrm = *this_cpu_ptr(comp->stream);
3350 + spin_unlock(&zstrm->zcomp_lock);
3353 int zcomp_compress(struct zcomp_strm *zstrm,
3355 pr_err("Can't allocate a compression stream\n");
3358 + spin_lock_init(&zstrm->zcomp_lock);
3359 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3362 diff -Nur linux-4.9.6.orig/drivers/block/zram/zcomp.h linux-4.9.6/drivers/block/zram/zcomp.h
3363 --- linux-4.9.6.orig/drivers/block/zram/zcomp.h 2017-01-26 08:25:24.000000000 +0100
3364 +++ linux-4.9.6/drivers/block/zram/zcomp.h 2017-01-28 13:59:09.907655265 +0100
3366 /* compression/decompression buffer */
3368 struct crypto_comp *tfm;
3369 + spinlock_t zcomp_lock;
3372 /* dynamic per-device compression frontend */
3373 diff -Nur linux-4.9.6.orig/drivers/block/zram/zram_drv.c linux-4.9.6/drivers/block/zram/zram_drv.c
3374 --- linux-4.9.6.orig/drivers/block/zram/zram_drv.c 2017-01-26 08:25:24.000000000 +0100
3375 +++ linux-4.9.6/drivers/block/zram/zram_drv.c 2017-01-28 13:59:09.907655265 +0100
3380 + zram_meta_init_table_locks(meta, disksize);
3385 @@ -575,28 +577,28 @@
3386 struct zram_meta *meta = zram->meta;
3387 unsigned long handle;
3389 + struct zcomp_strm *zstrm;
3391 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3392 + zram_lock_table(&meta->table[index]);
3393 handle = meta->table[index].handle;
3394 size = zram_get_obj_size(meta, index);
3396 if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3397 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3398 + zram_unlock_table(&meta->table[index]);
3403 + zstrm = zcomp_stream_get(zram->comp);
3404 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3405 if (size == PAGE_SIZE) {
3406 copy_page(mem, cmem);
3408 - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3410 ret = zcomp_decompress(zstrm, cmem, size, mem);
3411 - zcomp_stream_put(zram->comp);
3413 zs_unmap_object(meta->mem_pool, handle);
3414 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3415 + zcomp_stream_put(zram->comp);
3416 + zram_unlock_table(&meta->table[index]);
3418 /* Should NEVER happen. Return bio error if it does. */
3419 if (unlikely(ret)) {
3420 @@ -616,14 +618,14 @@
3421 struct zram_meta *meta = zram->meta;
3422 page = bvec->bv_page;
3424 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3425 + zram_lock_table(&meta->table[index]);
3426 if (unlikely(!meta->table[index].handle) ||
3427 zram_test_flag(meta, index, ZRAM_ZERO)) {
3428 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3429 + zram_unlock_table(&meta->table[index]);
3430 handle_zero_page(bvec);
3433 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3434 + zram_unlock_table(&meta->table[index]);
3436 if (is_partial_io(bvec))
3437 /* Use a temporary buffer to decompress the page */
3438 @@ -700,10 +702,10 @@
3440 kunmap_atomic(user_mem);
3441 /* Free memory associated with this sector now. */
3442 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3443 + zram_lock_table(&meta->table[index]);
3444 zram_free_page(zram, index);
3445 zram_set_flag(meta, index, ZRAM_ZERO);
3446 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3447 + zram_unlock_table(&meta->table[index]);
3449 atomic64_inc(&zram->stats.zero_pages);
3451 @@ -794,12 +796,12 @@
3452 * Free memory associated with this sector
3453 * before overwriting unused sectors.
3455 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3456 + zram_lock_table(&meta->table[index]);
3457 zram_free_page(zram, index);
3459 meta->table[index].handle = handle;
3460 zram_set_obj_size(meta, index, clen);
3461 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3462 + zram_unlock_table(&meta->table[index]);
3465 atomic64_add(clen, &zram->stats.compr_data_size);
3469 while (n >= PAGE_SIZE) {
3470 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3471 + zram_lock_table(&meta->table[index]);
3472 zram_free_page(zram, index);
3473 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3474 + zram_unlock_table(&meta->table[index]);
3475 atomic64_inc(&zram->stats.notify_free);
3479 zram = bdev->bd_disk->private_data;
3482 - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3483 + zram_lock_table(&meta->table[index]);
3484 zram_free_page(zram, index);
3485 - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3486 + zram_unlock_table(&meta->table[index]);
3487 atomic64_inc(&zram->stats.notify_free);
3490 diff -Nur linux-4.9.6.orig/drivers/block/zram/zram_drv.h linux-4.9.6/drivers/block/zram/zram_drv.h
3491 --- linux-4.9.6.orig/drivers/block/zram/zram_drv.h 2017-01-26 08:25:24.000000000 +0100
3492 +++ linux-4.9.6/drivers/block/zram/zram_drv.h 2017-01-28 13:59:09.907655265 +0100
3494 struct zram_table_entry {
3495 unsigned long handle;
3496 unsigned long value;
3497 +#ifdef CONFIG_PREEMPT_RT_BASE
3503 @@ -120,4 +123,42 @@
3505 bool claim; /* Protected by bdev->bd_mutex */
3508 +#ifndef CONFIG_PREEMPT_RT_BASE
3509 +static inline void zram_lock_table(struct zram_table_entry *table)
3511 + bit_spin_lock(ZRAM_ACCESS, &table->value);
3514 +static inline void zram_unlock_table(struct zram_table_entry *table)
3516 + bit_spin_unlock(ZRAM_ACCESS, &table->value);
3519 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3520 +#else /* CONFIG_PREEMPT_RT_BASE */
3521 +static inline void zram_lock_table(struct zram_table_entry *table)
3523 + spin_lock(&table->lock);
3524 + __set_bit(ZRAM_ACCESS, &table->value);
3527 +static inline void zram_unlock_table(struct zram_table_entry *table)
3529 + __clear_bit(ZRAM_ACCESS, &table->value);
3530 + spin_unlock(&table->lock);
3533 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3535 + size_t num_pages = disksize >> PAGE_SHIFT;
3538 + for (index = 0; index < num_pages; index++) {
3539 + spinlock_t *lock = &meta->table[index].lock;
3540 + spin_lock_init(lock);
3543 +#endif /* CONFIG_PREEMPT_RT_BASE */
3546 diff -Nur linux-4.9.6.orig/drivers/char/random.c linux-4.9.6/drivers/char/random.c
3547 --- linux-4.9.6.orig/drivers/char/random.c 2017-01-26 08:25:24.000000000 +0100
3548 +++ linux-4.9.6/drivers/char/random.c 2017-01-28 13:59:09.907655265 +0100
3549 @@ -1028,8 +1028,6 @@
3551 long delta, delta2, delta3;
3553 - preempt_disable();
3555 sample.jiffies = jiffies;
3556 sample.cycles = random_get_entropy();
3558 @@ -1070,7 +1068,6 @@
3560 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3565 void add_input_randomness(unsigned int type, unsigned int code,
3566 @@ -1123,28 +1120,27 @@
3567 return *(ptr + f->reg_idx++);
3570 -void add_interrupt_randomness(int irq, int irq_flags)
3571 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3573 struct entropy_store *r;
3574 struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
3575 - struct pt_regs *regs = get_irq_regs();
3576 unsigned long now = jiffies;
3577 cycles_t cycles = random_get_entropy();
3578 __u32 c_high, j_high;
3584 - cycles = get_reg(fast_pool, regs);
3585 + cycles = get_reg(fast_pool, NULL);
3586 c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3587 j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3588 fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3589 fast_pool->pool[1] ^= now ^ c_high;
3590 - ip = regs ? instruction_pointer(regs) : _RET_IP_;
3593 fast_pool->pool[2] ^= ip;
3594 fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3595 - get_reg(fast_pool, regs);
3596 + get_reg(fast_pool, NULL);
3598 fast_mix(fast_pool);
3599 add_interrupt_bench(cycles);
3600 diff -Nur linux-4.9.6.orig/drivers/clocksource/tcb_clksrc.c linux-4.9.6/drivers/clocksource/tcb_clksrc.c
3601 --- linux-4.9.6.orig/drivers/clocksource/tcb_clksrc.c 2017-01-26 08:25:24.000000000 +0100
3602 +++ linux-4.9.6/drivers/clocksource/tcb_clksrc.c 2017-01-28 13:59:09.907655265 +0100
3604 * this 32 bit free-running counter. the second channel is not used.
3606 * - The third channel may be used to provide a 16-bit clockevent
3607 - * source, used in either periodic or oneshot mode. This runs
3608 - * at 32 KiHZ, and can handle delays of up to two seconds.
3609 + * source, used in either periodic or oneshot mode.
3611 * A boot clocksource and clockevent source are also currently needed,
3612 * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3614 struct tc_clkevt_device {
3615 struct clock_event_device clkevt;
3623 return container_of(clkevt, struct tc_clkevt_device, clkevt);
3626 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3627 - * because using one of the divided clocks would usually mean the
3628 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3630 - * A divided clock could be good for high resolution timers, since
3631 - * 30.5 usec resolution can seem "low".
3633 static u32 timer_clock;
3635 +static void tc_clk_disable(struct clock_event_device *d)
3637 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3639 + clk_disable(tcd->clk);
3640 + tcd->clk_enabled = false;
3643 +static void tc_clk_enable(struct clock_event_device *d)
3645 + struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3647 + if (tcd->clk_enabled)
3649 + clk_enable(tcd->clk);
3650 + tcd->clk_enabled = true;
3653 static int tc_shutdown(struct clock_event_device *d)
3655 struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3658 __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3659 __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3663 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3666 if (!clockevent_state_detached(d))
3667 - clk_disable(tcd->clk);
3668 + tc_clk_disable(d);
3673 if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3676 - clk_enable(tcd->clk);
3679 - /* slow clock, count up to RC, then irq and stop */
3680 + /* count up to RC, then irq and stop */
3681 __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3682 ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3683 __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3684 @@ -134,12 +152,12 @@
3685 /* By not making the gentime core emulate periodic mode on top
3686 * of oneshot, we get lower overhead and improved accuracy.
3688 - clk_enable(tcd->clk);
3691 - /* slow clock, count up to RC, then irq and restart */
3692 + /* count up to RC, then irq and restart */
3693 __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3694 regs + ATMEL_TC_REG(2, CMR));
3695 - __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3696 + __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3698 /* Enable clock and interrupts on RC compare */
3699 __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3700 @@ -166,9 +184,13 @@
3701 .features = CLOCK_EVT_FEAT_PERIODIC |
3702 CLOCK_EVT_FEAT_ONESHOT,
3703 /* Should be lower than at91rm9200's system timer */
3704 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3709 .set_next_event = tc_next_event,
3710 - .set_state_shutdown = tc_shutdown,
3711 + .set_state_shutdown = tc_shutdown_clk_off,
3712 .set_state_periodic = tc_set_periodic,
3713 .set_state_oneshot = tc_set_oneshot,
3719 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3720 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3722 + unsigned divisor = atmel_tc_divisors[divisor_idx];
3724 struct clk *t2_clk = tc->clk[2];
3725 int irq = tc->irq[2];
3726 @@ -210,7 +233,11 @@
3727 clkevt.regs = tc->regs;
3728 clkevt.clk = t2_clk;
3730 - timer_clock = clk32k_divisor_idx;
3731 + timer_clock = divisor_idx;
3733 + clkevt.freq = 32768;
3735 + clkevt.freq = clk_get_rate(t2_clk) / divisor;
3737 clkevt.clkevt.cpumask = cpumask_of(0);
3743 - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3744 + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3748 @@ -358,7 +385,11 @@
3749 goto err_disable_t1;
3751 /* channel 2: periodic and oneshot timer support */
3752 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3753 ret = setup_clkevents(tc, clk32k_divisor_idx);
3755 + ret = setup_clkevents(tc, best_divisor_idx);
3758 goto err_unregister_clksrc;
3760 diff -Nur linux-4.9.6.orig/drivers/clocksource/timer-atmel-pit.c linux-4.9.6/drivers/clocksource/timer-atmel-pit.c
3761 --- linux-4.9.6.orig/drivers/clocksource/timer-atmel-pit.c 2017-01-26 08:25:24.000000000 +0100
3762 +++ linux-4.9.6/drivers/clocksource/timer-atmel-pit.c 2017-01-28 13:59:09.907655265 +0100
3767 + bool irq_requested;
3773 /* disable irq, leaving the clocksource active */
3774 pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3775 + if (data->irq_requested) {
3776 + free_irq(data->irq, data);
3777 + data->irq_requested = false;
3782 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3784 * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
3786 static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3788 struct pit_data *data = clkevt_to_pit_data(dev);
3791 + ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3792 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3793 + "at91_tick", data);
3795 + panic(pr_fmt("Unable to setup IRQ\n"));
3797 + data->irq_requested = true;
3799 /* update clocksource counter */
3800 data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3801 @@ -230,15 +245,6 @@
3805 - /* Set up irq handler */
3806 - ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3807 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3808 - "at91_tick", data);
3810 - pr_err("Unable to setup IRQ\n");
3814 /* Set up and register clockevents */
3815 data->clkevt.name = "pit";
3816 data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3817 diff -Nur linux-4.9.6.orig/drivers/clocksource/timer-atmel-st.c linux-4.9.6/drivers/clocksource/timer-atmel-st.c
3818 --- linux-4.9.6.orig/drivers/clocksource/timer-atmel-st.c 2017-01-26 08:25:24.000000000 +0100
3819 +++ linux-4.9.6/drivers/clocksource/timer-atmel-st.c 2017-01-28 13:59:09.907655265 +0100
3820 @@ -115,18 +115,29 @@
3821 last_crtr = read_CRTR();
3824 +static int atmel_st_irq;
3826 static int clkevt32k_shutdown(struct clock_event_device *evt)
3828 clkdev32k_disable_and_flush_irq();
3830 regmap_write(regmap_st, AT91_ST_IER, irqmask);
3831 + free_irq(atmel_st_irq, regmap_st);
3835 static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3839 clkdev32k_disable_and_flush_irq();
3841 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3842 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3843 + "at91_tick", regmap_st);
3845 + panic(pr_fmt("Unable to setup IRQ\n"));
3848 * ALM for oneshot irqs, set by next_event()
3849 * before 32 seconds have passed.
3850 @@ -139,8 +150,16 @@
3852 static int clkevt32k_set_periodic(struct clock_event_device *dev)
3856 clkdev32k_disable_and_flush_irq();
3858 + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3859 + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3860 + "at91_tick", regmap_st);
3862 + panic(pr_fmt("Unable to setup IRQ\n"));
3864 /* PIT for periodic irqs; fixed rate of 1/HZ */
3865 irqmask = AT91_ST_PITS;
3866 regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3870 unsigned int sclk_rate, val;
3874 regmap_st = syscon_node_to_regmap(node);
3875 if (IS_ERR(regmap_st)) {
3876 @@ -212,21 +231,12 @@
3877 regmap_read(regmap_st, AT91_ST_SR, &val);
3879 /* Get the interrupts property */
3880 - irq = irq_of_parse_and_map(node, 0);
3882 + atmel_st_irq = irq_of_parse_and_map(node, 0);
3883 + if (!atmel_st_irq) {
3884 pr_err("Unable to get IRQ from DT\n");
3888 - /* Make IRQs happen for the system timer */
3889 - ret = request_irq(irq, at91rm9200_timer_interrupt,
3890 - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3891 - "at91_tick", regmap_st);
3893 - pr_err("Unable to setup IRQ\n");
3897 sclk = of_clk_get(node, 0);
3899 pr_err("Unable to get slow clock\n");
3900 diff -Nur linux-4.9.6.orig/drivers/connector/cn_proc.c linux-4.9.6/drivers/connector/cn_proc.c
3901 --- linux-4.9.6.orig/drivers/connector/cn_proc.c 2017-01-26 08:25:24.000000000 +0100
3902 +++ linux-4.9.6/drivers/connector/cn_proc.c 2017-01-28 13:59:09.907655265 +0100
3904 #include <linux/pid_namespace.h>
3906 #include <linux/cn_proc.h>
3907 +#include <linux/locallock.h>
3910 * Size of a cn_msg followed by a proc_event structure. Since the
3913 /* proc_event_counts is used as the sequence number of the netlink message */
3914 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
3915 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
3917 static inline void send_msg(struct cn_msg *msg)
3919 - preempt_disable();
3920 + local_lock(send_msg_lock);
3922 msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
3923 ((struct proc_event *)msg->data)->cpu = smp_processor_id();
3926 cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
3929 + local_unlock(send_msg_lock);
3932 void proc_fork_connector(struct task_struct *task)
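The cn_proc hunk above is the canonical local-lock conversion in this series: a bare preempt_disable()/preempt_enable() pair guarding per-CPU state becomes local_lock()/local_unlock(). On a non-RT kernel the macros compile to exactly the old pair; on RT the lock is a per-CPU sleeping spinlock, so the section stays preemptible while per-CPU exclusion is preserved. A simplified sketch of the semantics, assuming the locallock.h definitions introduced earlier in this series (not the literal header, which also handles nesting):

    /* sketch only; see include/linux/locallock.h in this series */
    #ifdef CONFIG_PREEMPT_RT_BASE
    # define local_lock(lvar)        spin_lock(this_cpu_ptr(&(lvar).lock))
    # define local_unlock(lvar)      spin_unlock(this_cpu_ptr(&(lvar).lock))
    # define local_lock_irq(lvar)    spin_lock(this_cpu_ptr(&(lvar).lock))
    # define local_unlock_irq(lvar)  spin_unlock(this_cpu_ptr(&(lvar).lock))
    #else
    # define local_lock(lvar)        preempt_disable()
    # define local_unlock(lvar)      preempt_enable()
    # define local_lock_irq(lvar)    local_irq_disable()
    # define local_unlock_irq(lvar)  local_irq_enable()
    #endif

The intel_sprite.c hunk below uses the same facility in its _irq form to replace local_irq_disable() around the vblank-evasion loop.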
3933 diff -Nur linux-4.9.6.orig/drivers/cpufreq/Kconfig.x86 linux-4.9.6/drivers/cpufreq/Kconfig.x86
3934 --- linux-4.9.6.orig/drivers/cpufreq/Kconfig.x86 2017-01-26 08:25:24.000000000 +0100
3935 +++ linux-4.9.6/drivers/cpufreq/Kconfig.x86 2017-01-28 13:59:09.907655265 +0100
3938 config X86_POWERNOW_K8
3939 tristate "AMD Opteron/Athlon64 PowerNow!"
3940 - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
3941 + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
3943 This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
3944 Support for K10 and newer processors is now in acpi-cpufreq.
3945 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c linux-4.9.6/drivers/gpu/drm/i915/i915_gem_execbuffer.c
3946 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2017-01-26 08:25:24.000000000 +0100
3947 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2017-01-28 13:59:09.907655265 +0100
3948 @@ -1537,7 +1537,9 @@
3952 +#ifndef CONFIG_PREEMPT_RT_BASE
3953 trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
3956 i915_gem_execbuffer_move_to_active(vmas, params->request);
3958 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c linux-4.9.6/drivers/gpu/drm/i915/i915_gem_shrinker.c
3959 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_gem_shrinker.c 2017-01-26 08:25:24.000000000 +0100
3960 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_gem_shrinker.c 2017-01-28 13:59:09.907655265 +0100
3962 if (!mutex_is_locked(mutex))
3965 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
3966 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
3967 return mutex->owner == task;
3969 /* Since UP may be pre-empted, we cannot assume that we own the lock */
3970 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/i915_irq.c linux-4.9.6/drivers/gpu/drm/i915/i915_irq.c
3971 --- linux-4.9.6.orig/drivers/gpu/drm/i915/i915_irq.c 2017-01-26 08:25:24.000000000 +0100
3972 +++ linux-4.9.6/drivers/gpu/drm/i915/i915_irq.c 2017-01-28 13:59:09.907655265 +0100
3974 spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
3976 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
3977 + preempt_disable_rt();
3979 /* Get optional system timestamp before query. */
3982 *etime = ktime_get();
3984 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
3985 + preempt_enable_rt();
3987 spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
3989 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/intel_display.c linux-4.9.6/drivers/gpu/drm/i915/intel_display.c
3990 --- linux-4.9.6.orig/drivers/gpu/drm/i915/intel_display.c 2017-01-26 08:25:24.000000000 +0100
3991 +++ linux-4.9.6/drivers/gpu/drm/i915/intel_display.c 2017-01-28 13:59:09.911655420 +0100
3992 @@ -12131,7 +12131,7 @@
3993 struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
3994 struct intel_flip_work *work;
3996 - WARN_ON(!in_interrupt());
3997 + WARN_ON_NONRT(!in_interrupt());
4001 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/i915/intel_sprite.c linux-4.9.6/drivers/gpu/drm/i915/intel_sprite.c
4002 --- linux-4.9.6.orig/drivers/gpu/drm/i915/intel_sprite.c 2017-01-26 08:25:24.000000000 +0100
4003 +++ linux-4.9.6/drivers/gpu/drm/i915/intel_sprite.c 2017-01-28 13:59:09.911655420 +0100
4005 #include <drm/drm_rect.h>
4006 #include <drm/drm_atomic.h>
4007 #include <drm/drm_plane_helper.h>
4008 +#include <linux/locallock.h>
4009 #include "intel_drv.h"
4010 #include "intel_frontbuffer.h"
4011 #include <drm/i915_drm.h>
4013 1000 * adjusted_mode->crtc_htotal);
4016 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4019 * intel_pipe_update_start() - start update of a set of display registers
4020 * @crtc: the crtc of which the registers are going to be updated
4022 min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4023 max = vblank_start - 1;
4025 - local_irq_disable();
4026 + local_lock_irq(pipe_update_lock);
4028 if (min <= 0 || max <= 0)
4030 @@ -125,11 +128,11 @@
4034 - local_irq_enable();
4035 + local_unlock_irq(pipe_update_lock);
4037 timeout = schedule_timeout(timeout);
4039 - local_irq_disable();
4040 + local_lock_irq(pipe_update_lock);
4043 finish_wait(wq, &wait);
4045 crtc->base.state->event = NULL;
4048 - local_irq_enable();
4049 + local_unlock_irq(pipe_update_lock);
4051 if (crtc->debug.start_vbl_count &&
4052 crtc->debug.start_vbl_count != end_vbl_count) {
4053 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c linux-4.9.6/drivers/gpu/drm/msm/msm_gem_shrinker.c
4054 --- linux-4.9.6.orig/drivers/gpu/drm/msm/msm_gem_shrinker.c 2017-01-26 08:25:24.000000000 +0100
4055 +++ linux-4.9.6/drivers/gpu/drm/msm/msm_gem_shrinker.c 2017-01-28 13:59:09.911655420 +0100
4057 if (!mutex_is_locked(mutex))
4060 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4061 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4062 return mutex->owner == task;
4064 /* Since UP may be pre-empted, we cannot assume that we own the lock */
4065 diff -Nur linux-4.9.6.orig/drivers/gpu/drm/radeon/radeon_display.c linux-4.9.6/drivers/gpu/drm/radeon/radeon_display.c
4066 --- linux-4.9.6.orig/drivers/gpu/drm/radeon/radeon_display.c 2017-01-26 08:25:24.000000000 +0100
4067 +++ linux-4.9.6/drivers/gpu/drm/radeon/radeon_display.c 2017-01-28 13:59:09.911655420 +0100
4068 @@ -1845,6 +1845,7 @@
4069 struct radeon_device *rdev = dev->dev_private;
4071 /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4072 + preempt_disable_rt();
4074 /* Get optional system timestamp before query. */
4076 @@ -1937,6 +1938,7 @@
4077 *etime = ktime_get();
4079 /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4080 + preempt_enable_rt();
4082 /* Decode into vertical and horizontal scanout position. */
4083 *vpos = position & 0x1fff;
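The i915 and radeon hunks above finally activate the preempt_disable_rt() placeholders the comments point at: the register sampling between the two timestamp reads must not be preempted if the computed scanout position is to mean anything, yet on RT the surrounding spinlock alone no longer disables preemption. The helpers are RT-only and vanish on mainline; roughly, assuming the preempt.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define preempt_disable_rt()  preempt_disable()
    # define preempt_enable_rt()   preempt_enable()
    #else
    # define preempt_disable_rt()  do { } while (0)
    # define preempt_enable_rt()   do { } while (0)
    #endif

fs/exec.c later in this series uses the same pair around the mm switch in exec_mmap().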
4084 diff -Nur linux-4.9.6.orig/drivers/hv/vmbus_drv.c linux-4.9.6/drivers/hv/vmbus_drv.c
4085 --- linux-4.9.6.orig/drivers/hv/vmbus_drv.c 2017-01-26 08:25:24.000000000 +0100
4086 +++ linux-4.9.6/drivers/hv/vmbus_drv.c 2017-01-28 13:59:09.911655420 +0100
4089 struct hv_message *msg;
4090 union hv_synic_event_flags *event;
4091 + struct pt_regs *regs = get_irq_regs();
4092 + u64 ip = regs ? instruction_pointer(regs) : 0;
4093 bool handled = false;
4095 page_addr = hv_context.synic_event_page[cpu];
4097 tasklet_schedule(hv_context.msg_dpc[cpu]);
4100 - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4101 + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4105 diff -Nur linux-4.9.6.orig/drivers/ide/alim15x3.c linux-4.9.6/drivers/ide/alim15x3.c
4106 --- linux-4.9.6.orig/drivers/ide/alim15x3.c 2017-01-26 08:25:24.000000000 +0100
4107 +++ linux-4.9.6/drivers/ide/alim15x3.c 2017-01-28 13:59:09.911655420 +0100
4110 isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4112 - local_irq_save(flags);
4113 + local_irq_save_nort(flags);
4115 if (m5229_revision < 0xC2) {
4120 pci_dev_put(isa_dev);
4121 - local_irq_restore(flags);
4122 + local_irq_restore_nort(flags);
4126 diff -Nur linux-4.9.6.orig/drivers/ide/hpt366.c linux-4.9.6/drivers/ide/hpt366.c
4127 --- linux-4.9.6.orig/drivers/ide/hpt366.c 2017-01-26 08:25:24.000000000 +0100
4128 +++ linux-4.9.6/drivers/ide/hpt366.c 2017-01-28 13:59:09.911655420 +0100
4129 @@ -1236,7 +1236,7 @@
4131 dma_old = inb(base + 2);
4133 - local_irq_save(flags);
4134 + local_irq_save_nort(flags);
4137 pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4138 @@ -1247,7 +1247,7 @@
4139 if (dma_new != dma_old)
4140 outb(dma_new, base + 2);
4142 - local_irq_restore(flags);
4143 + local_irq_restore_nort(flags);
4145 printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
4146 hwif->name, base, base + 7);
4147 diff -Nur linux-4.9.6.orig/drivers/ide/ide-io.c linux-4.9.6/drivers/ide/ide-io.c
4148 --- linux-4.9.6.orig/drivers/ide/ide-io.c 2017-01-26 08:25:24.000000000 +0100
4149 +++ linux-4.9.6/drivers/ide/ide-io.c 2017-01-28 13:59:09.911655420 +0100
4151 /* disable_irq_nosync ?? */
4152 disable_irq(hwif->irq);
4153 /* local CPU only, as if we were handling an interrupt */
4154 - local_irq_disable();
4155 + local_irq_disable_nort();
4156 if (hwif->polling) {
4157 startstop = handler(drive);
4158 } else if (drive_is_ready(drive)) {
4159 diff -Nur linux-4.9.6.orig/drivers/ide/ide-iops.c linux-4.9.6/drivers/ide/ide-iops.c
4160 --- linux-4.9.6.orig/drivers/ide/ide-iops.c 2017-01-26 08:25:24.000000000 +0100
4161 +++ linux-4.9.6/drivers/ide/ide-iops.c 2017-01-28 13:59:09.911655420 +0100
4162 @@ -129,12 +129,12 @@
4163 if ((stat & ATA_BUSY) == 0)
4166 - local_irq_restore(flags);
4167 + local_irq_restore_nort(flags);
4172 - local_irq_restore(flags);
4173 + local_irq_restore_nort(flags);
4176 * Allow status to settle, then read it again.
4177 diff -Nur linux-4.9.6.orig/drivers/ide/ide-io-std.c linux-4.9.6/drivers/ide/ide-io-std.c
4178 --- linux-4.9.6.orig/drivers/ide/ide-io-std.c 2017-01-26 08:25:24.000000000 +0100
4179 +++ linux-4.9.6/drivers/ide/ide-io-std.c 2017-01-28 13:59:09.911655420 +0100
4181 unsigned long uninitialized_var(flags);
4183 if ((io_32bit & 2) && !mmio) {
4184 - local_irq_save(flags);
4185 + local_irq_save_nort(flags);
4186 ata_vlb_sync(io_ports->nsect_addr);
4190 insl(data_addr, buf, words);
4192 if ((io_32bit & 2) && !mmio)
4193 - local_irq_restore(flags);
4194 + local_irq_restore_nort(flags);
4196 if (((len + 1) & 3) < 2)
4199 unsigned long uninitialized_var(flags);
4201 if ((io_32bit & 2) && !mmio) {
4202 - local_irq_save(flags);
4203 + local_irq_save_nort(flags);
4204 ata_vlb_sync(io_ports->nsect_addr);
4208 outsl(data_addr, buf, words);
4210 if ((io_32bit & 2) && !mmio)
4211 - local_irq_restore(flags);
4212 + local_irq_restore_nort(flags);
4214 if (((len + 1) & 3) < 2)
4216 diff -Nur linux-4.9.6.orig/drivers/ide/ide-probe.c linux-4.9.6/drivers/ide/ide-probe.c
4217 --- linux-4.9.6.orig/drivers/ide/ide-probe.c 2017-01-26 08:25:24.000000000 +0100
4218 +++ linux-4.9.6/drivers/ide/ide-probe.c 2017-01-28 13:59:09.911655420 +0100
4219 @@ -196,10 +196,10 @@
4222 /* local CPU only; some systems need this */
4223 - local_irq_save(flags);
4224 + local_irq_save_nort(flags);
4225 /* read 512 bytes of id info */
4226 hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4227 - local_irq_restore(flags);
4228 + local_irq_restore_nort(flags);
4230 drive->dev_flags |= IDE_DFLAG_ID_READ;
4232 diff -Nur linux-4.9.6.orig/drivers/ide/ide-taskfile.c linux-4.9.6/drivers/ide/ide-taskfile.c
4233 --- linux-4.9.6.orig/drivers/ide/ide-taskfile.c 2017-01-26 08:25:24.000000000 +0100
4234 +++ linux-4.9.6/drivers/ide/ide-taskfile.c 2017-01-28 13:59:09.911655420 +0100
4237 page_is_high = PageHighMem(page);
4239 - local_irq_save(flags);
4240 + local_irq_save_nort(flags);
4242 buf = kmap_atomic(page) + offset;
4248 - local_irq_restore(flags);
4249 + local_irq_restore_nort(flags);
4256 if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4257 - local_irq_disable();
4258 + local_irq_disable_nort();
4260 ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
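Every IDE hunk above applies one transformation: local_irq_save()/local_irq_restore() around slow polled port I/O becomes the _nort ("no RT") variant. These sections only need interrupts off on a stock kernel; on RT they run in threaded context where multi-millisecond IRQ-off windows would wreck latencies, so the RT build keeps interrupts enabled and merely preserves the flags plumbing. A sketch of the helpers, assuming the interrupt.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define local_irq_disable_nort()       do { } while (0)
    # define local_irq_enable_nort()        do { } while (0)
    # define local_irq_save_nort(flags)     local_save_flags(flags)
    # define local_irq_restore_nort(flags)  (void)(flags)
    #else
    # define local_irq_disable_nort()       local_irq_disable()
    # define local_irq_enable_nort()        local_irq_enable()
    # define local_irq_save_nort(flags)     local_irq_save(flags)
    # define local_irq_restore_nort(flags)  local_irq_restore(flags)
    #endif

The ipoib, gameport, 3c59x, libsas, qla2xxx, USB HCD and ntfs hunks below are the same substitution in other drivers and filesystems.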
4262 diff -Nur linux-4.9.6.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-4.9.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4263 --- linux-4.9.6.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2017-01-26 08:25:24.000000000 +0100
4264 +++ linux-4.9.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2017-01-28 13:59:09.911655420 +0100
4267 ipoib_dbg_mcast(priv, "restarting multicast task\n");
4269 - local_irq_save(flags);
4270 + local_irq_save_nort(flags);
4271 netif_addr_lock(dev);
4272 spin_lock(&priv->lock);
4276 spin_unlock(&priv->lock);
4277 netif_addr_unlock(dev);
4278 - local_irq_restore(flags);
4279 + local_irq_restore_nort(flags);
4282 * make sure the in-flight joins have finished before we attempt
4283 diff -Nur linux-4.9.6.orig/drivers/input/gameport/gameport.c linux-4.9.6/drivers/input/gameport/gameport.c
4284 --- linux-4.9.6.orig/drivers/input/gameport/gameport.c 2017-01-26 08:25:24.000000000 +0100
4285 +++ linux-4.9.6/drivers/input/gameport/gameport.c 2017-01-28 13:59:09.915655573 +0100
4289 for (i = 0; i < 50; i++) {
4290 - local_irq_save(flags);
4291 + local_irq_save_nort(flags);
4292 t1 = ktime_get_ns();
4293 for (t = 0; t < 50; t++)
4294 gameport_read(gameport);
4295 t2 = ktime_get_ns();
4296 t3 = ktime_get_ns();
4297 - local_irq_restore(flags);
4298 + local_irq_restore_nort(flags);
4300 t = (t2 - t1) - (t3 - t2);
4302 @@ -124,12 +124,12 @@
4305 for(i = 0; i < 50; i++) {
4306 - local_irq_save(flags);
4307 + local_irq_save_nort(flags);
4309 for (t = 0; t < 50; t++) gameport_read(gameport);
4312 - local_irq_restore(flags);
4313 + local_irq_restore_nort(flags);
4315 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4317 @@ -148,11 +148,11 @@
4320 for(i = 0; i < 50; i++) {
4321 - local_irq_save(flags);
4322 + local_irq_save_nort(flags);
4324 for (t = 0; t < 50; t++) gameport_read(gameport);
4326 - local_irq_restore(flags);
4327 + local_irq_restore_nort(flags);
4329 if (t2 - t1 < tx) tx = t2 - t1;
4331 diff -Nur linux-4.9.6.orig/drivers/iommu/amd_iommu.c linux-4.9.6/drivers/iommu/amd_iommu.c
4332 --- linux-4.9.6.orig/drivers/iommu/amd_iommu.c 2017-01-26 08:25:24.000000000 +0100
4333 +++ linux-4.9.6/drivers/iommu/amd_iommu.c 2017-01-28 13:59:09.915655573 +0100
4334 @@ -1923,10 +1923,10 @@
4338 - * Must be called with IRQs disabled. Warn here to detect early
4340 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4341 + * detect early when it's not.
4343 - WARN_ON(!irqs_disabled());
4344 + WARN_ON_NONRT(!irqs_disabled());
4347 spin_lock(&domain->lock);
4348 @@ -2094,10 +2094,10 @@
4349 struct protection_domain *domain;
4352 - * Must be called with IRQs disabled. Warn here to detect early
4354 + * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4355 + * detect early when it's not.
4357 - WARN_ON(!irqs_disabled());
4358 + WARN_ON_NONRT(!irqs_disabled());
4360 if (WARN_ON(!dev_data->domain))
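WARN_ON_NONRT() exists because the asserted invariant is simply no longer true on RT: spin_lock_irqsave() on a sleeping spinlock leaves hardware interrupts enabled, so WARN_ON(!irqs_disabled()) would fire on every call. The check is still valuable on mainline, hence a conditional variant rather than outright deletion. Sketch, assuming the assertion-macro additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_BASE
    # define WARN_ON_NONRT(condition)  do { } while (0)
    # define BUG_ON_NONRT(condition)   do { } while (0)
    #else
    # define WARN_ON_NONRT(condition)  WARN_ON(condition)
    # define BUG_ON_NONRT(condition)   BUG_ON(condition)
    #endif

The dm-rq.c hunk below relies on BUG_ON_NONRT() in exactly the same way.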
4362 diff -Nur linux-4.9.6.orig/drivers/iommu/intel-iommu.c linux-4.9.6/drivers/iommu/intel-iommu.c
4363 --- linux-4.9.6.orig/drivers/iommu/intel-iommu.c 2017-01-26 08:25:24.000000000 +0100
4364 +++ linux-4.9.6/drivers/iommu/intel-iommu.c 2017-01-28 13:59:09.915655573 +0100
4366 struct deferred_flush_table *tables;
4369 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4370 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4372 /* bitmap for indexing intel_iommus */
4373 static int g_num_of_iommus;
4374 @@ -3715,10 +3715,8 @@
4375 struct intel_iommu *iommu;
4376 struct deferred_flush_entry *entry;
4377 struct deferred_flush_data *flush_data;
4378 - unsigned int cpuid;
4380 - cpuid = get_cpu();
4381 - flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4382 + flush_data = raw_cpu_ptr(&deferred_flush);
4384 /* Flush all CPUs' entries to avoid deferring too much. If
4385 * this becomes a bottleneck, can just flush us, and rely on
4386 @@ -3751,8 +3749,6 @@
4389 spin_unlock_irqrestore(&flush_data->lock, flags);
4394 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4395 diff -Nur linux-4.9.6.orig/drivers/iommu/iova.c linux-4.9.6/drivers/iommu/iova.c
4396 --- linux-4.9.6.orig/drivers/iommu/iova.c 2017-01-26 08:25:24.000000000 +0100
4397 +++ linux-4.9.6/drivers/iommu/iova.c 2017-01-28 13:59:09.915655573 +0100
4399 #include <linux/slab.h>
4400 #include <linux/smp.h>
4401 #include <linux/bitops.h>
4402 +#include <linux/cpu.h>
4404 static bool iova_rcache_insert(struct iova_domain *iovad,
4406 @@ -420,10 +421,8 @@
4408 /* Try replenishing IOVAs by flushing rcache. */
4409 flushed_rcache = true;
4410 - preempt_disable();
4411 for_each_online_cpu(cpu)
4412 free_cpu_cached_iovas(cpu, iovad);
4418 bool can_insert = false;
4419 unsigned long flags;
4421 - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4422 + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4423 spin_lock_irqsave(&cpu_rcache->lock, flags);
4425 if (!iova_magazine_full(cpu_rcache->loaded)) {
4427 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4429 spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4430 - put_cpu_ptr(rcache->cpu_rcaches);
4433 iova_magazine_free_pfns(mag_to_free, iovad);
4435 bool has_pfn = false;
4436 unsigned long flags;
4438 - cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4439 + cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4440 spin_lock_irqsave(&cpu_rcache->lock, flags);
4442 if (!iova_magazine_empty(cpu_rcache->loaded)) {
4444 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4446 spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4447 - put_cpu_ptr(rcache->cpu_rcaches);
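The iova.c conversion illustrates a recurring argument in this series: get_cpu_ptr() implies preempt_disable(), but here every access to the per-CPU rcache already happens under cpu_rcache->lock. Exclusion therefore comes from the lock, not from staying on one CPU, so raw_cpu_ptr() without pinning suffices; if the task migrates after picking a pointer it merely operates on another CPU's magazine, which the lock still serializes correctly. The shape of the pattern, as a standalone sketch with hypothetical names:

    struct cpu_cache { spinlock_t lock; /* ... per-CPU payload ... */ };

    static void cpu_cache_op(struct cpu_cache __percpu *pcp)
    {
        /* no preempt_disable(): the lock provides all the exclusion */
        struct cpu_cache *c = raw_cpu_ptr(pcp);
        unsigned long flags;

        spin_lock_irqsave(&c->lock, flags);
        /* touch c->...; correct even if we migrated after raw_cpu_ptr() */
        spin_unlock_irqrestore(&c->lock, flags);
    }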
4451 diff -Nur linux-4.9.6.orig/drivers/leds/trigger/Kconfig linux-4.9.6/drivers/leds/trigger/Kconfig
4452 --- linux-4.9.6.orig/drivers/leds/trigger/Kconfig 2017-01-26 08:25:24.000000000 +0100
4453 +++ linux-4.9.6/drivers/leds/trigger/Kconfig 2017-01-28 13:59:09.915655573 +0100
4456 config LEDS_TRIGGER_CPU
4457 bool "LED CPU Trigger"
4458 - depends on LEDS_TRIGGERS
4459 + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4461 This allows LEDs to be controlled by active CPUs. This shows
4462 the active CPUs across an array of LEDs so you can see which
4463 diff -Nur linux-4.9.6.orig/drivers/md/bcache/Kconfig linux-4.9.6/drivers/md/bcache/Kconfig
4464 --- linux-4.9.6.orig/drivers/md/bcache/Kconfig 2017-01-26 08:25:24.000000000 +0100
4465 +++ linux-4.9.6/drivers/md/bcache/Kconfig 2017-01-28 13:59:09.915655573 +0100
4469 tristate "Block device as cache"
4470 + depends on !PREEMPT_RT_FULL
4472 Allows a block device to be used as cache for other devices; uses
4473 a btree for indexing and the layout is optimized for SSDs.
4474 diff -Nur linux-4.9.6.orig/drivers/md/dm-rq.c linux-4.9.6/drivers/md/dm-rq.c
4475 --- linux-4.9.6.orig/drivers/md/dm-rq.c 2017-01-26 08:25:24.000000000 +0100
4476 +++ linux-4.9.6/drivers/md/dm-rq.c 2017-01-28 13:59:09.915655573 +0100
4478 /* Establish tio->ti before queuing work (map_tio_request) */
4480 kthread_queue_work(&md->kworker, &tio->work);
4481 - BUG_ON(!irqs_disabled());
4482 + BUG_ON_NONRT(!irqs_disabled());
4486 diff -Nur linux-4.9.6.orig/drivers/md/raid5.c linux-4.9.6/drivers/md/raid5.c
4487 --- linux-4.9.6.orig/drivers/md/raid5.c 2017-01-26 08:25:24.000000000 +0100
4488 +++ linux-4.9.6/drivers/md/raid5.c 2017-01-28 13:59:09.915655573 +0100
4489 @@ -1928,8 +1928,9 @@
4490 struct raid5_percpu *percpu;
4494 + cpu = get_cpu_light();
4495 percpu = per_cpu_ptr(conf->percpu, cpu);
4496 + spin_lock(&percpu->lock);
4497 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4498 ops_run_biofill(sh);
4500 @@ -1985,7 +1986,8 @@
4501 if (test_and_clear_bit(R5_Overlap, &dev->flags))
4502 wake_up(&sh->raid_conf->wait_for_overlap);
4505 + spin_unlock(&percpu->lock);
4509 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4510 @@ -6391,6 +6393,7 @@
4514 + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4518 @@ -6401,7 +6404,6 @@
4519 conf->percpu = alloc_percpu(struct raid5_percpu);
4523 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4525 conf->scribble_disks = max(conf->raid_disks,
4526 diff -Nur linux-4.9.6.orig/drivers/md/raid5.h linux-4.9.6/drivers/md/raid5.h
4527 --- linux-4.9.6.orig/drivers/md/raid5.h 2017-01-26 08:25:24.000000000 +0100
4528 +++ linux-4.9.6/drivers/md/raid5.h 2017-01-28 13:59:09.915655573 +0100
4530 int recovery_disabled;
4531 /* per cpu variables */
4532 struct raid5_percpu {
4533 + spinlock_t lock; /* Protection for -RT */
4534 struct page *spare_page; /* Used when checking P/Q in raid6 */
4535 struct flex_array *scribble; /* space for constructing buffer
4536 * lists and performing address
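The raid5 change pairs two pieces: get_cpu() becomes get_cpu_light(), and raid5_percpu grows a spinlock held across raid_run_ops(). get_cpu_light() is the RT replacement for get_cpu() wherever the pinned section may sleep or take sleeping locks: it keeps the task on its CPU via migrate_disable() but leaves preemption enabled, and the new per-CPU lock then keeps concurrent users off the shared scribble buffers. Sketch, assuming the smp.h additions from this series:

    /* sketch only */
    #ifdef CONFIG_PREEMPT_RT_FULL
    # define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()  migrate_enable()
    #else
    # define get_cpu_light()  get_cpu()
    # define put_cpu_light()  put_cpu()
    #endif

The fcoe, fcoe_ctlr, libfc and eventpoll hunks below use the same helper where the per-CPU data is either lock-protected or tolerant of a rare cross-CPU access.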
4537 diff -Nur linux-4.9.6.orig/drivers/misc/Kconfig linux-4.9.6/drivers/misc/Kconfig
4538 --- linux-4.9.6.orig/drivers/misc/Kconfig 2017-01-26 08:25:24.000000000 +0100
4539 +++ linux-4.9.6/drivers/misc/Kconfig 2017-01-28 13:59:09.915655573 +0100
4542 bool "Atmel AT32/AT91 Timer/Counter Library"
4543 depends on (AVR32 || ARCH_AT91)
4544 + default y if PREEMPT_RT_FULL
4546 Select this if you want a library to allocate the Timer/Counter
4547 blocks found on many Atmel processors. This facilitates using
4549 are combined to make a single 32-bit timer.
4551 When GENERIC_CLOCKEVENTS is defined, the third timer channel
4552 - may be used as a clock event device supporting oneshot mode
4553 - (delays of up to two seconds) based on the 32 KiHz clock.
4554 + may be used as a clock event device supporting oneshot mode.
4556 config ATMEL_TCB_CLKSRC_BLOCK
4559 TC can be used for other purposes, such as PWM generation and
4562 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4563 + bool "TC Block use 32 KiHz clock"
4564 + depends on ATMEL_TCB_CLKSRC
4565 + default y if !PREEMPT_RT_FULL
4567 + Select this to use 32 KiHz base clock rate as TC block clock
4568 + source for clock events.
4572 tristate "Dummy IRQ handler"
4574 diff -Nur linux-4.9.6.orig/drivers/mmc/host/mmci.c linux-4.9.6/drivers/mmc/host/mmci.c
4575 --- linux-4.9.6.orig/drivers/mmc/host/mmci.c 2017-01-26 08:25:24.000000000 +0100
4576 +++ linux-4.9.6/drivers/mmc/host/mmci.c 2017-01-28 13:59:09.915655573 +0100
4577 @@ -1147,15 +1147,12 @@
4578 struct sg_mapping_iter *sg_miter = &host->sg_miter;
4579 struct variant_data *variant = host->variant;
4580 void __iomem *base = host->base;
4581 - unsigned long flags;
4584 status = readl(base + MMCISTATUS);
4586 dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4588 - local_irq_save(flags);
4591 unsigned int remain, len;
4593 @@ -1195,8 +1192,6 @@
4595 sg_miter_stop(sg_miter);
4597 - local_irq_restore(flags);
4600 * If we have less than the fifo 'half-full' threshold to transfer,
4601 * trigger a PIO interrupt as soon as any data is available.
4602 diff -Nur linux-4.9.6.orig/drivers/net/ethernet/3com/3c59x.c linux-4.9.6/drivers/net/ethernet/3com/3c59x.c
4603 --- linux-4.9.6.orig/drivers/net/ethernet/3com/3c59x.c 2017-01-26 08:25:24.000000000 +0100
4604 +++ linux-4.9.6/drivers/net/ethernet/3com/3c59x.c 2017-01-28 13:59:09.919655726 +0100
4607 struct vortex_private *vp = netdev_priv(dev);
4608 unsigned long flags;
4609 - local_irq_save(flags);
4610 + local_irq_save_nort(flags);
4611 (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4612 - local_irq_restore(flags);
4613 + local_irq_restore_nort(flags);
4617 @@ -1910,12 +1910,12 @@
4618 * Block interrupts because vortex_interrupt does a bare spin_lock()
4620 unsigned long flags;
4621 - local_irq_save(flags);
4622 + local_irq_save_nort(flags);
4623 if (vp->full_bus_master_tx)
4624 boomerang_interrupt(dev->irq, dev);
4626 vortex_interrupt(dev->irq, dev);
4627 - local_irq_restore(flags);
4628 + local_irq_restore_nort(flags);
4632 diff -Nur linux-4.9.6.orig/drivers/net/ethernet/realtek/8139too.c linux-4.9.6/drivers/net/ethernet/realtek/8139too.c
4633 --- linux-4.9.6.orig/drivers/net/ethernet/realtek/8139too.c 2017-01-26 08:25:24.000000000 +0100
4634 +++ linux-4.9.6/drivers/net/ethernet/realtek/8139too.c 2017-01-28 13:59:09.919655726 +0100
4635 @@ -2233,7 +2233,7 @@
4636 struct rtl8139_private *tp = netdev_priv(dev);
4637 const int irq = tp->pci_dev->irq;
4640 + disable_irq_nosync(irq);
4641 rtl8139_interrupt(irq, dev);
4644 diff -Nur linux-4.9.6.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c linux-4.9.6/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4645 --- linux-4.9.6.orig/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-01-26 08:25:24.000000000 +0100
4646 +++ linux-4.9.6/drivers/net/wireless/intersil/orinoco/orinoco_usb.c 2017-01-28 13:59:09.919655726 +0100
4648 while (!ctx->done.done && msecs--)
4651 - wait_event_interruptible(ctx->done.wait,
4652 + swait_event_interruptible(ctx->done.wait,
4656 diff -Nur linux-4.9.6.orig/drivers/pci/access.c linux-4.9.6/drivers/pci/access.c
4657 --- linux-4.9.6.orig/drivers/pci/access.c 2017-01-26 08:25:24.000000000 +0100
4658 +++ linux-4.9.6/drivers/pci/access.c 2017-01-28 13:59:09.919655726 +0100
4660 WARN_ON(!dev->block_cfg_access);
4662 dev->block_cfg_access = 0;
4663 - wake_up_all(&pci_cfg_wait);
4664 + wake_up_all_locked(&pci_cfg_wait);
4665 raw_spin_unlock_irqrestore(&pci_lock, flags);
4667 EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4668 diff -Nur linux-4.9.6.orig/drivers/scsi/fcoe/fcoe.c linux-4.9.6/drivers/scsi/fcoe/fcoe.c
4669 --- linux-4.9.6.orig/drivers/scsi/fcoe/fcoe.c 2017-01-26 08:25:24.000000000 +0100
4670 +++ linux-4.9.6/drivers/scsi/fcoe/fcoe.c 2017-01-28 13:59:09.919655726 +0100
4671 @@ -1455,11 +1455,11 @@
4672 static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
4674 struct fcoe_percpu_s *fps;
4676 + int rc, cpu = get_cpu_light();
4678 - fps = &get_cpu_var(fcoe_percpu);
4679 + fps = &per_cpu(fcoe_percpu, cpu);
4680 rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
4681 - put_cpu_var(fcoe_percpu);
4686 @@ -1646,11 +1646,11 @@
4690 - stats = per_cpu_ptr(lport->stats, get_cpu());
4691 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
4692 stats->InvalidCRCCount++;
4693 if (stats->InvalidCRCCount < 5)
4694 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
4700 @@ -1693,7 +1693,7 @@
4702 hp = (struct fcoe_hdr *) skb_network_header(skb);
4704 - stats = per_cpu_ptr(lport->stats, get_cpu());
4705 + stats = per_cpu_ptr(lport->stats, get_cpu_light());
4706 if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
4707 if (stats->ErrorFrames < 5)
4708 printk(KERN_WARNING "fcoe: FCoE version "
4709 @@ -1725,13 +1725,13 @@
4712 if (!fcoe_filter_frames(lport, fp)) {
4715 fc_exch_recv(lport, fp);
4719 stats->ErrorFrames++;
4725 diff -Nur linux-4.9.6.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-4.9.6/drivers/scsi/fcoe/fcoe_ctlr.c
4726 --- linux-4.9.6.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2017-01-26 08:25:24.000000000 +0100
4727 +++ linux-4.9.6/drivers/scsi/fcoe/fcoe_ctlr.c 2017-01-28 13:59:09.919655726 +0100
4730 INIT_LIST_HEAD(&del_list);
4732 - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
4733 + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
4735 list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
4736 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
4738 sel_time = fcf->time;
4744 list_for_each_entry_safe(fcf, next, &del_list, list) {
4745 /* Removes fcf from current list */
4746 diff -Nur linux-4.9.6.orig/drivers/scsi/libfc/fc_exch.c linux-4.9.6/drivers/scsi/libfc/fc_exch.c
4747 --- linux-4.9.6.orig/drivers/scsi/libfc/fc_exch.c 2017-01-26 08:25:24.000000000 +0100
4748 +++ linux-4.9.6/drivers/scsi/libfc/fc_exch.c 2017-01-28 13:59:09.919655726 +0100
4749 @@ -814,10 +814,10 @@
4751 memset(ep, 0, sizeof(*ep));
4754 + cpu = get_cpu_light();
4755 pool = per_cpu_ptr(mp->pool, cpu);
4756 spin_lock_bh(&pool->lock);
4760 /* peek cache of free slot */
4761 if (pool->left != FC_XID_UNKNOWN) {
4762 diff -Nur linux-4.9.6.orig/drivers/scsi/libsas/sas_ata.c linux-4.9.6/drivers/scsi/libsas/sas_ata.c
4763 --- linux-4.9.6.orig/drivers/scsi/libsas/sas_ata.c 2017-01-26 08:25:24.000000000 +0100
4764 +++ linux-4.9.6/drivers/scsi/libsas/sas_ata.c 2017-01-28 13:59:09.919655726 +0100
4766 /* TODO: audit callers to ensure they are ready for qc_issue to
4767 * unconditionally re-enable interrupts
4769 - local_irq_save(flags);
4770 + local_irq_save_nort(flags);
4771 spin_unlock(ap->lock);
4773 /* If the device fell off, no sense in issuing commands */
4777 spin_lock(ap->lock);
4778 - local_irq_restore(flags);
4779 + local_irq_restore_nort(flags);
4783 diff -Nur linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_inline.h linux-4.9.6/drivers/scsi/qla2xxx/qla_inline.h
4784 --- linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_inline.h 2017-01-26 08:25:24.000000000 +0100
4785 +++ linux-4.9.6/drivers/scsi/qla2xxx/qla_inline.h 2017-01-28 13:59:09.919655726 +0100
4788 unsigned long flags;
4789 struct qla_hw_data *ha = rsp->hw;
4790 - local_irq_save(flags);
4791 + local_irq_save_nort(flags);
4792 if (IS_P3P_TYPE(ha))
4793 qla82xx_poll(0, rsp);
4795 ha->isp_ops->intr_handler(0, rsp);
4796 - local_irq_restore(flags);
4797 + local_irq_restore_nort(flags);
4800 static inline uint8_t *
4801 diff -Nur linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_isr.c linux-4.9.6/drivers/scsi/qla2xxx/qla_isr.c
4802 --- linux-4.9.6.orig/drivers/scsi/qla2xxx/qla_isr.c 2017-01-26 08:25:24.000000000 +0100
4803 +++ linux-4.9.6/drivers/scsi/qla2xxx/qla_isr.c 2017-01-28 13:59:09.919655726 +0100
4804 @@ -3125,7 +3125,11 @@
4807 kref_get(&qentry->irq_notify.kref);
4808 +#ifdef CONFIG_PREEMPT_RT_BASE
4809 + swork_queue(&qentry->irq_notify.swork);
4811 schedule_work(&qentry->irq_notify.work);
4816 diff -Nur linux-4.9.6.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-4.9.6/drivers/thermal/x86_pkg_temp_thermal.c
4817 --- linux-4.9.6.orig/drivers/thermal/x86_pkg_temp_thermal.c 2017-01-26 08:25:24.000000000 +0100
4818 +++ linux-4.9.6/drivers/thermal/x86_pkg_temp_thermal.c 2017-01-28 13:59:09.923655879 +0100
4820 #include <linux/pm.h>
4821 #include <linux/thermal.h>
4822 #include <linux/debugfs.h>
4823 +#include <linux/swork.h>
4824 #include <asm/cpu_device_id.h>
4825 #include <asm/mce.h>
4831 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4832 +static void platform_thermal_notify_work(struct swork_event *event)
4834 unsigned long flags;
4835 int cpu = smp_processor_id();
4837 pkg_work_scheduled[phy_id]) {
4838 disable_pkg_thres_interrupt();
4839 spin_unlock_irqrestore(&pkg_work_lock, flags);
4843 pkg_work_scheduled[phy_id] = 1;
4844 spin_unlock_irqrestore(&pkg_work_lock, flags);
4845 @@ -379,9 +380,48 @@
4846 schedule_delayed_work_on(cpu,
4847 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
4848 msecs_to_jiffies(notify_delay_ms));
4851 +#ifdef CONFIG_PREEMPT_RT_FULL
4852 +static struct swork_event notify_work;
4854 +static int thermal_notify_work_init(void)
4858 + err = swork_get();
4862 + INIT_SWORK(&notify_work, platform_thermal_notify_work);
4866 +static void thermal_notify_work_cleanup(void)
4871 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4873 + swork_queue(&notify_work);
4877 +#else /* !CONFIG_PREEMPT_RT_FULL */
4879 +static int thermal_notify_work_init(void) { return 0; }
4881 +static void thermal_notify_work_cleanup(void) { }
4883 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
4885 + platform_thermal_notify_work(NULL);
4889 +#endif /* CONFIG_PREEMPT_RT_FULL */
4891 static int find_siblings_cpu(int cpu)
4895 if (!x86_match_cpu(pkg_temp_thermal_ids))
4898 + if (!thermal_notify_work_init())
4901 spin_lock_init(&pkg_work_lock);
4902 platform_thermal_package_notify =
4903 pkg_temp_thermal_platform_thermal_notify;
4905 kfree(pkg_work_scheduled);
4906 platform_thermal_package_notify = NULL;
4907 platform_thermal_package_rate_control = NULL;
4909 + thermal_notify_work_cleanup();
4914 mutex_unlock(&phy_dev_list_mutex);
4915 platform_thermal_package_notify = NULL;
4916 platform_thermal_package_rate_control = NULL;
4917 + thermal_notify_work_cleanup();
4918 for_each_online_cpu(i)
4919 cancel_delayed_work_sync(
4920 &per_cpu(pkg_temp_thermal_threshold_work, i));
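pkg_temp_thermal_platform_thermal_notify() is invoked from the thermal interrupt. On RT, hard IRQ context cannot take the sleeping locks inside the regular workqueue enqueue path, so the notification is bounced through the "simple work" (swork) infrastructure, whose queue operation is safe from raw IRQ context and whose callback then runs in a preemptible kernel thread. The usage pattern, condensed into a standalone sketch (API per the include/linux/swork.h added by this series):

    #include <linux/swork.h>

    static struct swork_event ev;

    static void ev_fn(struct swork_event *sev)
    {
        /* preemptible worker context; may sleep, take mutexes, ... */
    }

    static int ev_setup(void)              /* e.g. from driver init */
    {
        int err = swork_get();             /* bring up the worker thread */
        if (err)
            return err;
        INIT_SWORK(&ev, ev_fn);
        return 0;
    }

    static void ev_raise(void)             /* safe from hard IRQ context */
    {
        swork_queue(&ev);
    }

    static void ev_teardown(void)
    {
        swork_put();
    }

fs/aio.c below applies the identical conversion to free_ioctx()/free_ioctx_users(), which may fire from atomic percpu_ref teardown context.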
4921 diff -Nur linux-4.9.6.orig/drivers/tty/serial/8250/8250_core.c linux-4.9.6/drivers/tty/serial/8250/8250_core.c
4922 --- linux-4.9.6.orig/drivers/tty/serial/8250/8250_core.c 2017-01-26 08:25:24.000000000 +0100
4923 +++ linux-4.9.6/drivers/tty/serial/8250/8250_core.c 2017-01-28 13:59:09.923655879 +0100
4926 static unsigned int skip_txen_test; /* force skip of txen test at init time */
4928 -#define PASS_LIMIT 512
4930 + * On -rt we can have more delays, and legitimately
4931 + * so, so don't drop work spuriously and spam the
4934 +#ifdef CONFIG_PREEMPT_RT_FULL
4935 +# define PASS_LIMIT 1000000
4937 +# define PASS_LIMIT 512
4940 #include <asm/serial.h>
4942 diff -Nur linux-4.9.6.orig/drivers/tty/serial/8250/8250_port.c linux-4.9.6/drivers/tty/serial/8250/8250_port.c
4943 --- linux-4.9.6.orig/drivers/tty/serial/8250/8250_port.c 2017-01-26 08:25:24.000000000 +0100
4944 +++ linux-4.9.6/drivers/tty/serial/8250/8250_port.c 2017-01-28 13:59:09.923655879 +0100
4946 #include <linux/nmi.h>
4947 #include <linux/mutex.h>
4948 #include <linux/slab.h>
4949 +#include <linux/kdb.h>
4950 #include <linux/uaccess.h>
4951 #include <linux/pm_runtime.h>
4952 #include <linux/timer.h>
4953 @@ -3144,9 +3145,9 @@
4955 serial8250_rpm_get(up);
4958 + if (port->sysrq || oops_in_progress)
4960 - else if (oops_in_progress)
4961 + else if (in_kdb_printk())
4962 locked = spin_trylock_irqsave(&port->lock, flags);
4964 spin_lock_irqsave(&port->lock, flags);
4965 diff -Nur linux-4.9.6.orig/drivers/tty/serial/amba-pl011.c linux-4.9.6/drivers/tty/serial/amba-pl011.c
4966 --- linux-4.9.6.orig/drivers/tty/serial/amba-pl011.c 2017-01-26 08:25:24.000000000 +0100
4967 +++ linux-4.9.6/drivers/tty/serial/amba-pl011.c 2017-01-28 13:59:09.923655879 +0100
4968 @@ -2194,13 +2194,19 @@
4970 clk_enable(uap->clk);
4972 - local_irq_save(flags);
4974 + * local_irq_save(flags);
4976 + * This local_irq_save() is nonsense. If we come in via sysrq
4977 + * handling then interrupts are already disabled. Aside from
4978 + * that, the port.sysrq check is racy on SMP regardless.
4980 if (uap->port.sysrq)
4982 else if (oops_in_progress)
4983 - locked = spin_trylock(&uap->port.lock);
4984 + locked = spin_trylock_irqsave(&uap->port.lock, flags);
4986 - spin_lock(&uap->port.lock);
4987 + spin_lock_irqsave(&uap->port.lock, flags);
4990 * First save the CR then disable the interrupts
4991 @@ -2224,8 +2230,7 @@
4992 pl011_write(old_cr, uap, REG_CR);
4995 - spin_unlock(&uap->port.lock);
4996 - local_irq_restore(flags);
4997 + spin_unlock_irqrestore(&uap->port.lock, flags);
4999 clk_disable(uap->clk);
5001 diff -Nur linux-4.9.6.orig/drivers/tty/serial/omap-serial.c linux-4.9.6/drivers/tty/serial/omap-serial.c
5002 --- linux-4.9.6.orig/drivers/tty/serial/omap-serial.c 2017-01-26 08:25:24.000000000 +0100
5003 +++ linux-4.9.6/drivers/tty/serial/omap-serial.c 2017-01-28 13:59:09.923655879 +0100
5004 @@ -1257,13 +1257,10 @@
5006 pm_runtime_get_sync(up->dev);
5008 - local_irq_save(flags);
5009 - if (up->port.sysrq)
5011 - else if (oops_in_progress)
5012 - locked = spin_trylock(&up->port.lock);
5013 + if (up->port.sysrq || oops_in_progress)
5014 + locked = spin_trylock_irqsave(&up->port.lock, flags);
5016 - spin_lock(&up->port.lock);
5017 + spin_lock_irqsave(&up->port.lock, flags);
5020 * First save the IER then disable the interrupts
5021 @@ -1292,8 +1289,7 @@
5022 pm_runtime_mark_last_busy(up->dev);
5023 pm_runtime_put_autosuspend(up->dev);
5025 - spin_unlock(&up->port.lock);
5026 - local_irq_restore(flags);
5027 + spin_unlock_irqrestore(&up->port.lock, flags);
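The three console write paths above (8250, pl011, omap-serial) converge on one locking shape: always take the port lock with irqsave semantics so the whole operation nests correctly on RT, but only trylock when the CPU may already hold the lock (sysrq handling, oops_in_progress, kdb), because deadlocking the console during a crash dump is worse than a torn output line. The common skeleton, with a hypothetical driver name:

    static void foo_console_write(struct uart_port *port, const char *s,
                                  unsigned int count)
    {
        unsigned long flags;
        int locked = 1;

        if (port->sysrq || oops_in_progress)
            locked = spin_trylock_irqsave(&port->lock, flags);
        else
            spin_lock_irqsave(&port->lock, flags);

        /* ... mask UART interrupts, emit s[0..count), restore ... */

        if (locked)
            spin_unlock_irqrestore(&port->lock, flags);
    }

8250 goes a step further, as its hunk shows: sysrq and oops skip the lock entirely, and only kdb output uses the trylock.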
5031 diff -Nur linux-4.9.6.orig/drivers/usb/core/hcd.c linux-4.9.6/drivers/usb/core/hcd.c
5032 --- linux-4.9.6.orig/drivers/usb/core/hcd.c 2017-01-26 08:25:24.000000000 +0100
5033 +++ linux-4.9.6/drivers/usb/core/hcd.c 2017-01-28 13:59:09.923655879 +0100
5034 @@ -1761,9 +1761,9 @@
5035 * and no one may trigger the above deadlock situation when
5036 * running complete() in tasklet.
5038 - local_irq_save(flags);
5039 + local_irq_save_nort(flags);
5041 - local_irq_restore(flags);
5042 + local_irq_restore_nort(flags);
5044 usb_anchor_resume_wakeups(anchor);
5045 atomic_dec(&urb->use_count);
5046 diff -Nur linux-4.9.6.orig/drivers/usb/gadget/function/f_fs.c linux-4.9.6/drivers/usb/gadget/function/f_fs.c
5047 --- linux-4.9.6.orig/drivers/usb/gadget/function/f_fs.c 2017-01-26 08:25:24.000000000 +0100
5048 +++ linux-4.9.6/drivers/usb/gadget/function/f_fs.c 2017-01-28 13:59:09.923655879 +0100
5049 @@ -1593,7 +1593,7 @@
5050 pr_info("%s(): freeing\n", __func__);
5051 ffs_data_clear(ffs);
5052 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5053 - waitqueue_active(&ffs->ep0req_completion.wait));
5054 + swait_active(&ffs->ep0req_completion.wait));
5055 kfree(ffs->dev_name);
5058 diff -Nur linux-4.9.6.orig/drivers/usb/gadget/legacy/inode.c linux-4.9.6/drivers/usb/gadget/legacy/inode.c
5059 --- linux-4.9.6.orig/drivers/usb/gadget/legacy/inode.c 2017-01-26 08:25:24.000000000 +0100
5060 +++ linux-4.9.6/drivers/usb/gadget/legacy/inode.c 2017-01-28 13:59:09.923655879 +0100
5062 spin_unlock_irq (&epdata->dev->lock);
5064 if (likely (value == 0)) {
5065 - value = wait_event_interruptible (done.wait, done.done);
5066 + value = swait_event_interruptible (done.wait, done.done);
5068 spin_lock_irq (&epdata->dev->lock);
5069 if (likely (epdata->ep != NULL)) {
5071 usb_ep_dequeue (epdata->ep, epdata->req);
5072 spin_unlock_irq (&epdata->dev->lock);
5074 - wait_event (done.wait, done.done);
5075 + swait_event (done.wait, done.done);
5076 if (epdata->status == -ECONNRESET)
5077 epdata->status = -EINTR;
5079 diff -Nur linux-4.9.6.orig/fs/aio.c linux-4.9.6/fs/aio.c
5080 --- linux-4.9.6.orig/fs/aio.c 2017-01-26 08:25:24.000000000 +0100
5081 +++ linux-4.9.6/fs/aio.c 2017-01-28 13:59:09.923655879 +0100
5083 #include <linux/ramfs.h>
5084 #include <linux/percpu-refcount.h>
5085 #include <linux/mount.h>
5086 +#include <linux/swork.h>
5088 #include <asm/kmap_types.h>
5089 #include <asm/uaccess.h>
5091 struct page **ring_pages;
5094 - struct work_struct free_work;
5095 + struct swork_event free_work;
5098 * signals when all in-flight requests are done
5101 .kill_sb = kill_anon_super,
5103 + BUG_ON(swork_get());
5104 aio_mnt = kern_mount(&aio_fs);
5105 if (IS_ERR(aio_mnt))
5106 panic("Failed to create aio fs mount.");
5108 return cancel(&kiocb->common);
5111 -static void free_ioctx(struct work_struct *work)
5112 +static void free_ioctx(struct swork_event *sev)
5114 - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5115 + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5117 pr_debug("freeing %p\n", ctx);
5120 if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5121 complete(&ctx->rq_wait->comp);
5123 - INIT_WORK(&ctx->free_work, free_ioctx);
5124 - schedule_work(&ctx->free_work);
5125 + INIT_SWORK(&ctx->free_work, free_ioctx);
5126 + swork_queue(&ctx->free_work);
5131 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5132 * now it's safe to cancel any that need to be.
5134 -static void free_ioctx_users(struct percpu_ref *ref)
5135 +static void free_ioctx_users_work(struct swork_event *sev)
5137 - struct kioctx *ctx = container_of(ref, struct kioctx, users);
5138 + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5139 struct aio_kiocb *req;
5141 spin_lock_irq(&ctx->ctx_lock);
5142 @@ -632,6 +634,14 @@
5143 percpu_ref_put(&ctx->reqs);
5146 +static void free_ioctx_users(struct percpu_ref *ref)
5148 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
5150 + INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5151 + swork_queue(&ctx->free_work);
5154 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5157 diff -Nur linux-4.9.6.orig/fs/autofs4/autofs_i.h linux-4.9.6/fs/autofs4/autofs_i.h
5158 --- linux-4.9.6.orig/fs/autofs4/autofs_i.h 2017-01-26 08:25:24.000000000 +0100
5159 +++ linux-4.9.6/fs/autofs4/autofs_i.h 2017-01-28 13:59:09.923655879 +0100
5161 #include <linux/sched.h>
5162 #include <linux/mount.h>
5163 #include <linux/namei.h>
5164 +#include <linux/delay.h>
5165 #include <asm/current.h>
5166 #include <linux/uaccess.h>
5168 diff -Nur linux-4.9.6.orig/fs/autofs4/expire.c linux-4.9.6/fs/autofs4/expire.c
5169 --- linux-4.9.6.orig/fs/autofs4/expire.c 2017-01-26 08:25:24.000000000 +0100
5170 +++ linux-4.9.6/fs/autofs4/expire.c 2017-01-28 13:59:09.923655879 +0100
5172 parent = p->d_parent;
5173 if (!spin_trylock(&parent->d_lock)) {
5174 spin_unlock(&p->d_lock);
5179 spin_unlock(&p->d_lock);
5180 diff -Nur linux-4.9.6.orig/fs/buffer.c linux-4.9.6/fs/buffer.c
5181 --- linux-4.9.6.orig/fs/buffer.c 2017-01-26 08:25:24.000000000 +0100
5182 +++ linux-4.9.6/fs/buffer.c 2017-01-28 13:59:09.927656033 +0100
5184 * decide that the page is now completely done.
5186 first = page_buffers(page);
5187 - local_irq_save(flags);
5188 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5189 + flags = bh_uptodate_lock_irqsave(first);
5190 clear_buffer_async_read(bh);
5195 tmp = tmp->b_this_page;
5196 } while (tmp != bh);
5197 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5198 - local_irq_restore(flags);
5199 + bh_uptodate_unlock_irqrestore(first, flags);
5202 * If none of the buffers had errors and they are all
5207 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5208 - local_irq_restore(flags);
5210 + bh_uptodate_unlock_irqrestore(first, flags);
5217 first = page_buffers(page);
5218 - local_irq_save(flags);
5219 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5220 + flags = bh_uptodate_lock_irqsave(first);
5222 clear_buffer_async_write(bh);
5224 @@ -371,15 +366,12 @@
5226 tmp = tmp->b_this_page;
5228 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5229 - local_irq_restore(flags);
5230 + bh_uptodate_unlock_irqrestore(first, flags);
5231 end_page_writeback(page);
5235 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5236 - local_irq_restore(flags);
5238 + bh_uptodate_unlock_irqrestore(first, flags);
5240 EXPORT_SYMBOL(end_buffer_async_write);
5242 @@ -3383,6 +3375,7 @@
5243 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5245 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5246 + buffer_head_init_locks(ret);
5248 __this_cpu_inc(bh_accounting.nr);
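end_buffer_async_read()/end_buffer_async_write() previously serialized against each other with local_irq_save() plus bit_spin_lock(BH_Uptodate_Lock), a pure spinning lock that cannot be made preemptible. The patch folds that pair into bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore(), which on RT back onto a real spinlock embedded in struct buffer_head and set up by the buffer_head_init_locks() call added in the allocation hunk above. Sketch of the helpers, assuming the buffer_head.h changes from this series:

    static inline unsigned long
    bh_uptodate_lock_irqsave(struct buffer_head *bh)
    {
        unsigned long flags;

    #ifndef CONFIG_PREEMPT_RT_BASE
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
    #else
        spin_lock_irqsave(&bh->b_uptodate_lock, flags);
    #endif
        return flags;
    }

    static inline void
    bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
    {
    #ifndef CONFIG_PREEMPT_RT_BASE
        bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
        local_irq_restore(flags);
    #else
        spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
    #endif
    }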
5250 diff -Nur linux-4.9.6.orig/fs/cifs/readdir.c linux-4.9.6/fs/cifs/readdir.c
5251 --- linux-4.9.6.orig/fs/cifs/readdir.c 2017-01-26 08:25:24.000000000 +0100
5252 +++ linux-4.9.6/fs/cifs/readdir.c 2017-01-28 13:59:09.927656033 +0100
5254 struct inode *inode;
5255 struct super_block *sb = parent->d_sb;
5256 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5257 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5258 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5260 cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5262 diff -Nur linux-4.9.6.orig/fs/dcache.c linux-4.9.6/fs/dcache.c
5263 --- linux-4.9.6.orig/fs/dcache.c 2017-01-26 08:25:24.000000000 +0100
5264 +++ linux-4.9.6/fs/dcache.c 2017-01-28 13:59:09.927656033 +0100
5266 #include <linux/mm.h>
5267 #include <linux/fs.h>
5268 #include <linux/fsnotify.h>
5269 +#include <linux/delay.h>
5270 #include <linux/slab.h>
5271 #include <linux/init.h>
5272 #include <linux/hash.h>
5275 void dput(struct dentry *dentry)
5277 + struct dentry *parent;
5279 if (unlikely(!dentry))
5282 @@ -788,9 +791,18 @@
5286 - dentry = dentry_kill(dentry);
5289 + parent = dentry_kill(dentry);
5293 + if (parent == dentry) {
5294 + /* the task with the highest priority won't schedule */
5295 + r = cond_resched();
5304 @@ -2324,7 +2336,7 @@
5305 if (dentry->d_lockref.count == 1) {
5306 if (!spin_trylock(&inode->i_lock)) {
5307 spin_unlock(&dentry->d_lock);
5312 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5313 @@ -2384,21 +2396,24 @@
5315 static void d_wait_lookup(struct dentry *dentry)
5317 - if (d_in_lookup(dentry)) {
5318 - DECLARE_WAITQUEUE(wait, current);
5319 - add_wait_queue(dentry->d_wait, &wait);
5321 - set_current_state(TASK_UNINTERRUPTIBLE);
5322 - spin_unlock(&dentry->d_lock);
5324 - spin_lock(&dentry->d_lock);
5325 - } while (d_in_lookup(dentry));
5327 + struct swait_queue __wait;
5329 + if (!d_in_lookup(dentry))
5332 + INIT_LIST_HEAD(&__wait.task_list);
5334 + prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5335 + spin_unlock(&dentry->d_lock);
5337 + spin_lock(&dentry->d_lock);
5338 + } while (d_in_lookup(dentry));
5339 + finish_swait(dentry->d_wait, &__wait);
5342 struct dentry *d_alloc_parallel(struct dentry *parent,
5343 const struct qstr *name,
5344 - wait_queue_head_t *wq)
5345 + struct swait_queue_head *wq)
5347 unsigned int hash = name->hash;
5348 struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5349 @@ -2507,7 +2522,7 @@
5351 dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5352 __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5353 - wake_up_all(dentry->d_wait);
5354 + swake_up_all(dentry->d_wait);
5355 dentry->d_wait = NULL;
5357 INIT_HLIST_NODE(&dentry->d_u.d_alias);
5358 @@ -3604,6 +3619,11 @@
5360 void __init vfs_caches_init_early(void)
5364 + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5365 + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5367 dcache_init_early();
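The dcache changes swap the parallel-lookup waitqueue for a simple waitqueue (swait). Regular waitqueues allow custom wakeup callbacks and unbounded work under the waitqueue lock; swait queues guarantee short, raw-spinlock-safe wake-ups, which is what makes swake_up_all() legal from the contexts __d_lookup_done() runs in on RT. d_wait_lookup() above open-codes the wait loop; the same loop written with the stock swait helpers, as a standalone sketch:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(wq);
    static bool done;

    static void waiter(void)
    {
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
            prepare_to_swait(&wq, &wait, TASK_UNINTERRUPTIBLE);
            if (done)
                break;
            schedule();
        }
        finish_swait(&wq, &wait);
    }

    static void waker(void)
    {
        done = true;
        swake_up_all(&wq);
    }

The cifs, fuse, namei and nfs hunks below merely chase the resulting d_alloc_parallel() signature change from wait_queue_head_t to struct swait_queue_head.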
5370 diff -Nur linux-4.9.6.orig/fs/eventpoll.c linux-4.9.6/fs/eventpoll.c
5371 --- linux-4.9.6.orig/fs/eventpoll.c 2017-01-26 08:25:24.000000000 +0100
5372 +++ linux-4.9.6/fs/eventpoll.c 2017-01-28 13:59:09.927656033 +0100
5373 @@ -510,12 +510,12 @@
5375 static void ep_poll_safewake(wait_queue_head_t *wq)
5377 - int this_cpu = get_cpu();
5378 + int this_cpu = get_cpu_light();
5380 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5381 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5387 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5388 diff -Nur linux-4.9.6.orig/fs/exec.c linux-4.9.6/fs/exec.c
5389 --- linux-4.9.6.orig/fs/exec.c 2017-01-26 08:25:24.000000000 +0100
5390 +++ linux-4.9.6/fs/exec.c 2017-01-28 13:59:09.927656033 +0100
5391 @@ -1017,12 +1017,14 @@
5395 + preempt_disable_rt();
5396 active_mm = tsk->active_mm;
5398 tsk->active_mm = mm;
5399 activate_mm(active_mm, mm);
5400 tsk->mm->vmacache_seqnum = 0;
5401 vmacache_flush(tsk);
5402 + preempt_enable_rt();
5405 up_read(&old_mm->mmap_sem);
5406 diff -Nur linux-4.9.6.orig/fs/fuse/dir.c linux-4.9.6/fs/fuse/dir.c
5407 --- linux-4.9.6.orig/fs/fuse/dir.c 2017-01-26 08:25:24.000000000 +0100
5408 +++ linux-4.9.6/fs/fuse/dir.c 2017-01-28 13:59:09.927656033 +0100
5409 @@ -1191,7 +1191,7 @@
5410 struct inode *dir = d_inode(parent);
5411 struct fuse_conn *fc;
5412 struct inode *inode;
5413 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5414 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5418 diff -Nur linux-4.9.6.orig/fs/jbd2/checkpoint.c linux-4.9.6/fs/jbd2/checkpoint.c
5419 --- linux-4.9.6.orig/fs/jbd2/checkpoint.c 2017-01-26 08:25:24.000000000 +0100
5420 +++ linux-4.9.6/fs/jbd2/checkpoint.c 2017-01-28 13:59:09.927656033 +0100
5422 nblocks = jbd2_space_needed(journal);
5423 while (jbd2_log_space_left(journal) < nblocks) {
5424 write_unlock(&journal->j_state_lock);
5425 + if (current->plug)
5427 mutex_lock(&journal->j_checkpoint_mutex);
5430 diff -Nur linux-4.9.6.orig/fs/locks.c linux-4.9.6/fs/locks.c
5431 --- linux-4.9.6.orig/fs/locks.c 2017-01-26 08:25:24.000000000 +0100
5432 +++ linux-4.9.6/fs/locks.c 2017-01-28 13:59:09.927656033 +0100
5437 - percpu_down_read_preempt_disable(&file_rwsem);
5438 + percpu_down_read(&file_rwsem);
5439 spin_lock(&ctx->flc_lock);
5440 if (request->fl_flags & FL_ACCESS)
5445 spin_unlock(&ctx->flc_lock);
5446 - percpu_up_read_preempt_enable(&file_rwsem);
5447 + percpu_up_read(&file_rwsem);
5449 locks_free_lock(new_fl);
5450 locks_dispose_list(&dispose);
5451 @@ -1013,7 +1013,7 @@
5452 new_fl2 = locks_alloc_lock();
5455 - percpu_down_read_preempt_disable(&file_rwsem);
5456 + percpu_down_read(&file_rwsem);
5457 spin_lock(&ctx->flc_lock);
5459 * New lock request. Walk all POSIX locks and look for conflicts. If
5460 @@ -1185,7 +1185,7 @@
5463 spin_unlock(&ctx->flc_lock);
5464 - percpu_up_read_preempt_enable(&file_rwsem);
5465 + percpu_up_read(&file_rwsem);
5467 * Free any unused locks.
5469 @@ -1460,7 +1460,7 @@
5473 - percpu_down_read_preempt_disable(&file_rwsem);
5474 + percpu_down_read(&file_rwsem);
5475 spin_lock(&ctx->flc_lock);
5477 time_out_leases(inode, &dispose);
5478 @@ -1512,13 +1512,13 @@
5479 locks_insert_block(fl, new_fl);
5480 trace_break_lease_block(inode, new_fl);
5481 spin_unlock(&ctx->flc_lock);
5482 - percpu_up_read_preempt_enable(&file_rwsem);
5483 + percpu_up_read(&file_rwsem);
5485 locks_dispose_list(&dispose);
5486 error = wait_event_interruptible_timeout(new_fl->fl_wait,
5487 !new_fl->fl_next, break_time);
5489 - percpu_down_read_preempt_disable(&file_rwsem);
5490 + percpu_down_read(&file_rwsem);
5491 spin_lock(&ctx->flc_lock);
5492 trace_break_lease_unblock(inode, new_fl);
5493 locks_delete_block(new_fl);
5494 @@ -1535,7 +1535,7 @@
5497 spin_unlock(&ctx->flc_lock);
5498 - percpu_up_read_preempt_enable(&file_rwsem);
5499 + percpu_up_read(&file_rwsem);
5500 locks_dispose_list(&dispose);
5501 locks_free_lock(new_fl);
5503 @@ -1609,7 +1609,7 @@
5505 ctx = smp_load_acquire(&inode->i_flctx);
5506 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5507 - percpu_down_read_preempt_disable(&file_rwsem);
5508 + percpu_down_read(&file_rwsem);
5509 spin_lock(&ctx->flc_lock);
5510 time_out_leases(inode, &dispose);
5511 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5512 @@ -1619,7 +1619,7 @@
5515 spin_unlock(&ctx->flc_lock);
5516 - percpu_up_read_preempt_enable(&file_rwsem);
5517 + percpu_up_read(&file_rwsem);
5519 locks_dispose_list(&dispose);
5521 @@ -1694,7 +1694,7 @@
5525 - percpu_down_read_preempt_disable(&file_rwsem);
5526 + percpu_down_read(&file_rwsem);
5527 spin_lock(&ctx->flc_lock);
5528 time_out_leases(inode, &dispose);
5529 error = check_conflicting_open(dentry, arg, lease->fl_flags);
5530 @@ -1765,7 +1765,7 @@
5531 lease->fl_lmops->lm_setup(lease, priv);
5533 spin_unlock(&ctx->flc_lock);
5534 - percpu_up_read_preempt_enable(&file_rwsem);
5535 + percpu_up_read(&file_rwsem);
5536 locks_dispose_list(&dispose);
5538 inode_unlock(inode);
5539 @@ -1788,7 +1788,7 @@
5543 - percpu_down_read_preempt_disable(&file_rwsem);
5544 + percpu_down_read(&file_rwsem);
5545 spin_lock(&ctx->flc_lock);
5546 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5547 if (fl->fl_file == filp &&
5548 @@ -1801,7 +1801,7 @@
5550 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5551 spin_unlock(&ctx->flc_lock);
5552 - percpu_up_read_preempt_enable(&file_rwsem);
5553 + percpu_up_read(&file_rwsem);
5554 locks_dispose_list(&dispose);
5557 @@ -2532,13 +2532,13 @@
5558 if (list_empty(&ctx->flc_lease))
5561 - percpu_down_read_preempt_disable(&file_rwsem);
5562 + percpu_down_read(&file_rwsem);
5563 spin_lock(&ctx->flc_lock);
5564 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5565 if (filp == fl->fl_file)
5566 lease_modify(fl, F_UNLCK, &dispose);
5567 spin_unlock(&ctx->flc_lock);
5568 - percpu_up_read_preempt_enable(&file_rwsem);
5569 + percpu_up_read(&file_rwsem);
5571 locks_dispose_list(&dispose);
5573 diff -Nur linux-4.9.6.orig/fs/namei.c linux-4.9.6/fs/namei.c
5574 --- linux-4.9.6.orig/fs/namei.c 2017-01-26 08:25:24.000000000 +0100
5575 +++ linux-4.9.6/fs/namei.c 2017-01-28 13:59:09.927656033 +0100
5576 @@ -1629,7 +1629,7 @@
5578 struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5579 struct inode *inode = dir->d_inode;
5580 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5581 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5583 inode_lock_shared(inode);
5584 /* Don't go there if it's already dead */
5585 @@ -3086,7 +3086,7 @@
5586 struct dentry *dentry;
5587 int error, create_error = 0;
5588 umode_t mode = op->mode;
5589 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5590 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5592 if (unlikely(IS_DEADDIR(dir_inode)))
5594 diff -Nur linux-4.9.6.orig/fs/namespace.c linux-4.9.6/fs/namespace.c
5595 --- linux-4.9.6.orig/fs/namespace.c 2017-01-26 08:25:24.000000000 +0100
5596 +++ linux-4.9.6/fs/namespace.c 2017-01-28 13:59:09.931656187 +0100
5598 #include <linux/mnt_namespace.h>
5599 #include <linux/user_namespace.h>
5600 #include <linux/namei.h>
5601 +#include <linux/delay.h>
5602 #include <linux/security.h>
5603 #include <linux/idr.h>
5604 #include <linux/init.h> /* init_rootfs */
5605 @@ -356,8 +357,11 @@
5606 * incremented count after it has set MNT_WRITE_HOLD.
5609 - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5611 + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5614 + preempt_disable();
5617 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
5618 * be set to match its requirements. So we must not load that until
5619 diff -Nur linux-4.9.6.orig/fs/nfs/delegation.c linux-4.9.6/fs/nfs/delegation.c
5620 --- linux-4.9.6.orig/fs/nfs/delegation.c 2017-01-26 08:25:24.000000000 +0100
5621 +++ linux-4.9.6/fs/nfs/delegation.c 2017-01-28 13:59:09.931656187 +0100
5622 @@ -150,11 +150,11 @@
5624 /* Block nfs4_proc_unlck */
5625 mutex_lock(&sp->so_delegreturn_mutex);
5626 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5627 + seq = read_seqbegin(&sp->so_reclaim_seqlock);
5628 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
5630 err = nfs_delegation_claim_locks(ctx, state, stateid);
5631 - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5632 + if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
5634 mutex_unlock(&sp->so_delegreturn_mutex);
5635 put_nfs_open_context(ctx);
5636 diff -Nur linux-4.9.6.orig/fs/nfs/dir.c linux-4.9.6/fs/nfs/dir.c
5637 --- linux-4.9.6.orig/fs/nfs/dir.c 2017-01-26 08:25:24.000000000 +0100
5638 +++ linux-4.9.6/fs/nfs/dir.c 2017-01-28 13:59:09.931656187 +0100
5640 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
5642 struct qstr filename = QSTR_INIT(entry->name, entry->len);
5643 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5644 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5645 struct dentry *dentry;
5646 struct dentry *alias;
5647 struct inode *dir = d_inode(parent);
5648 @@ -1487,7 +1487,7 @@
5649 struct file *file, unsigned open_flags,
5650 umode_t mode, int *opened)
5652 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5653 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5654 struct nfs_open_context *ctx;
5656 struct iattr attr = { .ia_valid = ATTR_OPEN };
5657 @@ -1802,7 +1802,11 @@
5659 trace_nfs_rmdir_enter(dir, dentry);
5660 if (d_really_is_positive(dentry)) {
5661 +#ifdef CONFIG_PREEMPT_RT_BASE
5662 + down(&NFS_I(d_inode(dentry))->rmdir_sem);
5664 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5666 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5667 /* Ensure the VFS deletes this inode */
5669 @@ -1812,7 +1816,11 @@
5671 nfs_dentry_handle_enoent(dentry);
5673 +#ifdef CONFIG_PREEMPT_RT_BASE
5674 + up(&NFS_I(d_inode(dentry))->rmdir_sem);
5676 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
5679 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
5680 trace_nfs_rmdir_exit(dir, dentry, error);
5681 diff -Nur linux-4.9.6.orig/fs/nfs/inode.c linux-4.9.6/fs/nfs/inode.c
5682 --- linux-4.9.6.orig/fs/nfs/inode.c 2017-01-26 08:25:24.000000000 +0100
5683 +++ linux-4.9.6/fs/nfs/inode.c 2017-01-28 13:59:09.931656187 +0100
5684 @@ -1957,7 +1957,11 @@
5685 nfsi->nrequests = 0;
5686 nfsi->commit_info.ncommit = 0;
5687 atomic_set(&nfsi->commit_info.rpcs_out, 0);
5688 +#ifdef CONFIG_PREEMPT_RT_BASE
5689 + sema_init(&nfsi->rmdir_sem, 1);
5691 init_rwsem(&nfsi->rmdir_sem);
5693 nfs4_init_once(nfsi);
5696 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4_fs.h linux-4.9.6/fs/nfs/nfs4_fs.h
5697 --- linux-4.9.6.orig/fs/nfs/nfs4_fs.h 2017-01-26 08:25:24.000000000 +0100
5698 +++ linux-4.9.6/fs/nfs/nfs4_fs.h 2017-01-28 13:59:09.931656187 +0100
5700 unsigned long so_flags;
5701 struct list_head so_states;
5702 struct nfs_seqid_counter so_seqid;
5703 - seqcount_t so_reclaim_seqcount;
5704 + seqlock_t so_reclaim_seqlock;
5705 struct mutex so_delegreturn_mutex;
5708 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4proc.c linux-4.9.6/fs/nfs/nfs4proc.c
5709 --- linux-4.9.6.orig/fs/nfs/nfs4proc.c 2017-01-26 08:25:24.000000000 +0100
5710 +++ linux-4.9.6/fs/nfs/nfs4proc.c 2017-01-28 13:59:09.931656187 +0100
5711 @@ -2697,7 +2697,7 @@
5715 - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
5716 + seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5718 ret = _nfs4_proc_open(opendata);
5720 @@ -2735,7 +2735,7 @@
5722 if (d_inode(dentry) == state->inode) {
5723 nfs_inode_attach_open_context(ctx);
5724 - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
5725 + if (read_seqretry(&sp->so_reclaim_seqlock, seq))
5726 nfs4_schedule_stateid_recovery(server, state);
5729 diff -Nur linux-4.9.6.orig/fs/nfs/nfs4state.c linux-4.9.6/fs/nfs/nfs4state.c
5730 --- linux-4.9.6.orig/fs/nfs/nfs4state.c 2017-01-26 08:25:24.000000000 +0100
5731 +++ linux-4.9.6/fs/nfs/nfs4state.c 2017-01-28 13:59:09.931656187 +0100
5733 nfs4_init_seqid_counter(&sp->so_seqid);
5734 atomic_set(&sp->so_count, 1);
5735 INIT_LIST_HEAD(&sp->so_lru);
5736 - seqcount_init(&sp->so_reclaim_seqcount);
5737 + seqlock_init(&sp->so_reclaim_seqlock);
5738 mutex_init(&sp->so_delegreturn_mutex);
5741 @@ -1497,8 +1497,12 @@
5742 * recovering after a network partition or a reboot from a
5743 * server that doesn't support a grace period.
5745 +#ifdef CONFIG_PREEMPT_RT_FULL
5746 + write_seqlock(&sp->so_reclaim_seqlock);
5747 +#else
5748 + write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
5749 +#endif
5750 spin_lock(&sp->so_lock);
5751 - raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
5753 list_for_each_entry(state, &sp->so_states, open_states) {
5754 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
5755 @@ -1567,14 +1571,20 @@
5756 spin_lock(&sp->so_lock);
5759 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5760 spin_unlock(&sp->so_lock);
5761 +#ifdef CONFIG_PREEMPT_RT_FULL
5762 + write_sequnlock(&sp->so_reclaim_seqlock);
5763 +#else
5764 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5765 +#endif
5768 nfs4_put_open_state(state);
5769 - spin_lock(&sp->so_lock);
5770 - raw_write_seqcount_end(&sp->so_reclaim_seqcount);
5771 - spin_unlock(&sp->so_lock);
5772 +#ifdef CONFIG_PREEMPT_RT_FULL
5773 + write_sequnlock(&sp->so_reclaim_seqlock);
5774 +#else
5775 + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
5776 +#endif
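
The hunks above turn the NFSv4 state-owner's bare seqcount into a full seqlock_t: the write side is then serialized by the spinlock embedded in the seqlock, which on RT is a sleeping lock, so a preempted writer cannot leave readers spinning forever on an odd sequence count. A minimal sketch of the resulting reader/writer pairing, using the generic seqlock API rather than the NFS code itself (names are illustrative):

    static DEFINE_SEQLOCK(state_seqlock);
    static int state_a, state_b;

    /* Writer: the lock embedded in the seqlock serializes writers. */
    static void state_update(int a, int b)
    {
            write_seqlock(&state_seqlock);
            state_a = a;
            state_b = b;
            write_sequnlock(&state_seqlock);
    }

    /* Reader: retries if it overlapped a writer. */
    static void state_sample(int *a, int *b)
    {
            unsigned int seq;

            do {
                    seq = read_seqbegin(&state_seqlock);
                    *a = state_a;
                    *b = state_b;
            } while (read_seqretry(&state_seqlock, seq));
    }
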
5780 diff -Nur linux-4.9.6.orig/fs/nfs/unlink.c linux-4.9.6/fs/nfs/unlink.c
5781 --- linux-4.9.6.orig/fs/nfs/unlink.c 2017-01-26 08:25:24.000000000 +0100
5782 +++ linux-4.9.6/fs/nfs/unlink.c 2017-01-28 13:59:09.931656187 +0100
5784 #include <linux/sunrpc/clnt.h>
5785 #include <linux/nfs_fs.h>
5786 #include <linux/sched.h>
5787 -#include <linux/wait.h>
5788 +#include <linux/swait.h>
5789 #include <linux/namei.h>
5790 #include <linux/fsnotify.h>
5793 rpc_restart_call_prepare(task);
5796 +#ifdef CONFIG_PREEMPT_RT_BASE
5797 +static void nfs_down_anon(struct semaphore *sema)
5798 +{
5799 + down(sema);
5800 +}
5801 +
5802 +static void nfs_up_anon(struct semaphore *sema)
5803 +{
5804 + up(sema);
5805 +}
5806 +
5807 +#else
5808 +static void nfs_down_anon(struct rw_semaphore *rwsem)
5809 +{
5810 + down_read_non_owner(rwsem);
5811 +}
5812 +
5813 +static void nfs_up_anon(struct rw_semaphore *rwsem)
5814 +{
5815 + up_read_non_owner(rwsem);
5816 +}
5817 +#endif
5820 * nfs_async_unlink_release - Release the sillydelete data.
5821 * @task: rpc_task of the sillydelete
5823 struct dentry *dentry = data->dentry;
5824 struct super_block *sb = dentry->d_sb;
5826 - up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
5827 + nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
5828 d_lookup_done(dentry);
5829 nfs_free_unlinkdata(data);
5831 @@ -117,10 +140,10 @@
5832 struct inode *dir = d_inode(dentry->d_parent);
5833 struct dentry *alias;
5835 - down_read_non_owner(&NFS_I(dir)->rmdir_sem);
5836 + nfs_down_anon(&NFS_I(dir)->rmdir_sem);
5837 alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
5838 if (IS_ERR(alias)) {
5839 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
5840 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
5843 if (!d_in_lookup(alias)) {
5846 spin_unlock(&alias->d_lock);
5848 - up_read_non_owner(&NFS_I(dir)->rmdir_sem);
5849 + nfs_up_anon(&NFS_I(dir)->rmdir_sem);
5851 * If we'd displaced old cached devname, free it. At that
5852 * point dentry is definitely not a root, so we won't need
5856 data->res.dir_attr = &data->dir_attr;
5857 - init_waitqueue_head(&data->wq);
5858 + init_swait_queue_head(&data->wq);
5861 spin_lock(&dentry->d_lock);
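
The rmdir_sem conversion across the fs/nfs hunks above works because a plain counting semaphore, unlike an rtmutex-based rw_semaphore, records no owner: the down_read_non_owner()/up_read_non_owner() pair spans tasks (taken when the sillydelete RPC is queued, released from the RPC callback), which has no equivalent in the RT rwsem with its owner-based priority inheritance. RT therefore trades reader parallelism for a lock type without owner semantics. A sketch of the cross-task pattern, with hypothetical names:

    static struct semaphore anon_sem;     /* sema_init(&anon_sem, 1) at setup */

    static void submit_async_op(void)
    {
            down(&anon_sem);              /* acquired in the submitting task */
            /* ... queue the async work; the matching up() runs elsewhere ... */
    }

    static void async_op_release(void)    /* runs in the completion task */
    {
            up(&anon_sem);                /* legal: semaphores track no owner */
    }
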
5862 diff -Nur linux-4.9.6.orig/fs/ntfs/aops.c linux-4.9.6/fs/ntfs/aops.c
5863 --- linux-4.9.6.orig/fs/ntfs/aops.c 2017-01-26 08:25:24.000000000 +0100
5864 +++ linux-4.9.6/fs/ntfs/aops.c 2017-01-28 13:59:09.931656187 +0100
5867 if (file_ofs < init_size)
5868 ofs = init_size - file_ofs;
5869 - local_irq_save(flags);
5870 + local_irq_save_nort(flags);
5871 kaddr = kmap_atomic(page);
5872 memset(kaddr + bh_offset(bh) + ofs, 0,
5874 flush_dcache_page(page);
5875 kunmap_atomic(kaddr);
5876 - local_irq_restore(flags);
5877 + local_irq_restore_nort(flags);
5880 clear_buffer_uptodate(bh);
5882 "0x%llx.", (unsigned long long)bh->b_blocknr);
5884 first = page_buffers(page);
5885 - local_irq_save(flags);
5886 - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5887 + flags = bh_uptodate_lock_irqsave(first);
5888 clear_buffer_async_read(bh);
5893 tmp = tmp->b_this_page;
5894 } while (tmp != bh);
5895 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5896 - local_irq_restore(flags);
5897 + bh_uptodate_unlock_irqrestore(first, flags);
5899 * If none of the buffers had errors then we can set the page uptodate,
5900 * but we first have to perform the post read mst fixups, if the
5901 @@ -145,13 +143,13 @@
5902 recs = PAGE_SIZE / rec_size;
5903 /* Should have been verified before we got here... */
5905 - local_irq_save(flags);
5906 + local_irq_save_nort(flags);
5907 kaddr = kmap_atomic(page);
5908 for (i = 0; i < recs; i++)
5909 post_read_mst_fixup((NTFS_RECORD*)(kaddr +
5910 i * rec_size), rec_size);
5911 kunmap_atomic(kaddr);
5912 - local_irq_restore(flags);
5913 + local_irq_restore_nort(flags);
5914 flush_dcache_page(page);
5915 if (likely(page_uptodate && !PageError(page)))
5916 SetPageUptodate(page);
5921 - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5922 - local_irq_restore(flags);
5924 + bh_uptodate_unlock_irqrestore(first, flags);
5928 diff -Nur linux-4.9.6.orig/fs/proc/base.c linux-4.9.6/fs/proc/base.c
5929 --- linux-4.9.6.orig/fs/proc/base.c 2017-01-26 08:25:24.000000000 +0100
5930 +++ linux-4.9.6/fs/proc/base.c 2017-01-28 13:59:09.931656187 +0100
5931 @@ -1834,7 +1834,7 @@
5933 child = d_hash_and_lookup(dir, &qname);
5935 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5936 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5937 child = d_alloc_parallel(dir, &qname, &wq);
5939 goto end_instantiate;
5940 diff -Nur linux-4.9.6.orig/fs/proc/proc_sysctl.c linux-4.9.6/fs/proc/proc_sysctl.c
5941 --- linux-4.9.6.orig/fs/proc/proc_sysctl.c 2017-01-26 08:25:24.000000000 +0100
5942 +++ linux-4.9.6/fs/proc/proc_sysctl.c 2017-01-28 13:59:09.935656342 +0100
5945 child = d_lookup(dir, &qname);
5947 - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5948 + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5949 child = d_alloc_parallel(dir, &qname, &wq);
5952 diff -Nur linux-4.9.6.orig/fs/timerfd.c linux-4.9.6/fs/timerfd.c
5953 --- linux-4.9.6.orig/fs/timerfd.c 2017-01-26 08:25:24.000000000 +0100
5954 +++ linux-4.9.6/fs/timerfd.c 2017-01-28 13:59:09.935656342 +0100
5955 @@ -460,7 +460,10 @@
5958 spin_unlock_irq(&ctx->wqh.lock);
5959 - cpu_relax();
5960 + if (isalarm(ctx))
5961 + hrtimer_wait_for_timer(&ctx->t.alarm.timer);
5962 + else
5963 + hrtimer_wait_for_timer(&ctx->t.tmr);
5967 diff -Nur linux-4.9.6.orig/include/acpi/platform/aclinux.h linux-4.9.6/include/acpi/platform/aclinux.h
5968 --- linux-4.9.6.orig/include/acpi/platform/aclinux.h 2017-01-26 08:25:24.000000000 +0100
5969 +++ linux-4.9.6/include/acpi/platform/aclinux.h 2017-01-28 13:59:09.935656342 +0100
5972 #define acpi_cache_t struct kmem_cache
5973 #define acpi_spinlock spinlock_t *
5974 +#define acpi_raw_spinlock raw_spinlock_t *
5975 #define acpi_cpu_flags unsigned long
5977 /* Use native linux version of acpi_os_allocate_zeroed */
5978 @@ -151,6 +152,20 @@
5979 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
5980 #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
5982 +#define acpi_os_create_raw_lock(__handle) \
5983 +({ \
5984 + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
5985 + \
5986 + if (lock) { \
5987 + *(__handle) = lock; \
5988 + raw_spin_lock_init(*(__handle)); \
5989 + } \
5990 + lock ? AE_OK : AE_NO_MEMORY; \
5991 +})
5993 +#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
5997 * OSL interfaces used by debugger/disassembler
5999 diff -Nur linux-4.9.6.orig/include/asm-generic/bug.h linux-4.9.6/include/asm-generic/bug.h
6000 --- linux-4.9.6.orig/include/asm-generic/bug.h 2017-01-26 08:25:24.000000000 +0100
6001 +++ linux-4.9.6/include/asm-generic/bug.h 2017-01-28 13:59:09.935656342 +0100
6002 @@ -215,6 +215,20 @@
6003 # define WARN_ON_SMP(x) ({0;})
6006 +#ifdef CONFIG_PREEMPT_RT_BASE
6007 +# define BUG_ON_RT(c) BUG_ON(c)
6008 +# define BUG_ON_NONRT(c) do { } while (0)
6009 +# define WARN_ON_RT(condition) WARN_ON(condition)
6010 +# define WARN_ON_NONRT(condition) do { } while (0)
6011 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6013 +# define BUG_ON_RT(c) do { } while (0)
6014 +# define BUG_ON_NONRT(c) BUG_ON(c)
6015 +# define WARN_ON_RT(condition) do { } while (0)
6016 +# define WARN_ON_NONRT(condition) WARN_ON(condition)
6017 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6020 #endif /* __ASSEMBLY__ */
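
These helpers let assertions that only hold in one locking model compile away in the other: a path that runs with interrupts disabled on !RT typically runs under a sleeping lock on RT, where irqs_disabled() is legitimately false. A short sketch (function name is illustrative):

    static void queue_pull(void)
    {
            /* True under the !RT locking scheme; on RT the caller holds a
             * sleeping lock instead, so the check must not fire there. */
            WARN_ON_NONRT(!irqs_disabled());

            /* ... manipulate the per-CPU queue ... */
    }
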
6023 diff -Nur linux-4.9.6.orig/include/linux/blkdev.h linux-4.9.6/include/linux/blkdev.h
6024 --- linux-4.9.6.orig/include/linux/blkdev.h 2017-01-26 08:25:24.000000000 +0100
6025 +++ linux-4.9.6/include/linux/blkdev.h 2017-01-28 13:59:09.935656342 +0100
6027 struct list_head queuelist;
6029 struct call_single_data csd;
6030 + struct work_struct work;
6035 struct throtl_data *td;
6037 struct rcu_head rcu_head;
6038 - wait_queue_head_t mq_freeze_wq;
6039 + struct swait_queue_head mq_freeze_wq;
6040 struct percpu_ref q_usage_counter;
6041 struct list_head all_q_node;
6043 diff -Nur linux-4.9.6.orig/include/linux/blk-mq.h linux-4.9.6/include/linux/blk-mq.h
6044 --- linux-4.9.6.orig/include/linux/blk-mq.h 2017-01-26 08:25:24.000000000 +0100
6045 +++ linux-4.9.6/include/linux/blk-mq.h 2017-01-28 13:59:09.935656342 +0100
6047 return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6051 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6052 int blk_mq_request_started(struct request *rq);
6053 void blk_mq_start_request(struct request *rq);
6054 void blk_mq_end_request(struct request *rq, int error);
6055 diff -Nur linux-4.9.6.orig/include/linux/bottom_half.h linux-4.9.6/include/linux/bottom_half.h
6056 --- linux-4.9.6.orig/include/linux/bottom_half.h 2017-01-26 08:25:24.000000000 +0100
6057 +++ linux-4.9.6/include/linux/bottom_half.h 2017-01-28 13:59:09.935656342 +0100
6060 #include <linux/preempt.h>
6062 +#ifdef CONFIG_PREEMPT_RT_FULL
6064 +extern void __local_bh_disable(void);
6065 +extern void _local_bh_enable(void);
6066 +extern void __local_bh_enable(void);
6068 +static inline void local_bh_disable(void)
6069 +{
6070 + __local_bh_disable();
6071 +}
6073 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6074 +{
6075 + __local_bh_disable();
6076 +}
6078 +static inline void local_bh_enable(void)
6079 +{
6080 + __local_bh_enable();
6081 +}
6083 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6084 +{
6085 + __local_bh_enable();
6086 +}
6088 +static inline void local_bh_enable_ip(unsigned long ip)
6089 +{
6090 + __local_bh_enable();
6091 +}
6095 #ifdef CONFIG_TRACE_IRQFLAGS
6096 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6100 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6104 #endif /* _LINUX_BH_H */
6105 diff -Nur linux-4.9.6.orig/include/linux/buffer_head.h linux-4.9.6/include/linux/buffer_head.h
6106 --- linux-4.9.6.orig/include/linux/buffer_head.h 2017-01-26 08:25:24.000000000 +0100
6107 +++ linux-4.9.6/include/linux/buffer_head.h 2017-01-28 13:59:09.935656342 +0100
6109 struct address_space *b_assoc_map; /* mapping this buffer is
6111 atomic_t b_count; /* users using this buffer_head */
6112 +#ifdef CONFIG_PREEMPT_RT_BASE
6113 + spinlock_t b_uptodate_lock;
6114 +#if IS_ENABLED(CONFIG_JBD2)
6115 + spinlock_t b_state_lock;
6116 + spinlock_t b_journal_head_lock;
6121 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6123 + unsigned long flags;
6125 +#ifndef CONFIG_PREEMPT_RT_BASE
6126 + local_irq_save(flags);
6127 + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6129 + spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6135 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6137 +#ifndef CONFIG_PREEMPT_RT_BASE
6138 + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6139 + local_irq_restore(flags);
6141 + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6145 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6147 +#ifdef CONFIG_PREEMPT_RT_BASE
6148 + spin_lock_init(&bh->b_uptodate_lock);
6149 +#if IS_ENABLED(CONFIG_JBD2)
6150 + spin_lock_init(&bh->b_state_lock);
6151 + spin_lock_init(&bh->b_journal_head_lock);
6157 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6158 * and buffer_foo() functions.
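
bit_spin_lock(BH_Uptodate_Lock, ...) under local_irq_save() is a non-sleeping busy-wait, which RT cannot tolerate, so the buffer_head grows a real spinlock and the two helpers above pick the right implementation per configuration. An end_io-style callsite then looks the same in both worlds; compare the ntfs/aops.c hunks earlier in this patch. A sketch with the surrounding I/O logic elided (the `first = bh` shortcut stands in for page_buffers(page)):

    static void my_end_buffer_async_read(struct buffer_head *bh, int uptodate)
    {
            struct buffer_head *first = bh;   /* real code: page_buffers(page) */
            unsigned long flags;

            flags = bh_uptodate_lock_irqsave(first);
            clear_buffer_async_read(bh);
            /* ... walk the bh->b_this_page ring under the lock ... */
            bh_uptodate_unlock_irqrestore(first, flags);
    }
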
6159 diff -Nur linux-4.9.6.orig/include/linux/cgroup-defs.h linux-4.9.6/include/linux/cgroup-defs.h
6160 --- linux-4.9.6.orig/include/linux/cgroup-defs.h 2017-01-26 08:25:24.000000000 +0100
6161 +++ linux-4.9.6/include/linux/cgroup-defs.h 2017-01-28 13:59:09.935656342 +0100
6163 #include <linux/percpu-refcount.h>
6164 #include <linux/percpu-rwsem.h>
6165 #include <linux/workqueue.h>
6166 +#include <linux/swork.h>
6168 #ifdef CONFIG_CGROUPS
6171 /* percpu_ref killing and RCU release */
6172 struct rcu_head rcu_head;
6173 struct work_struct destroy_work;
6174 + struct swork_event destroy_swork;
6178 diff -Nur linux-4.9.6.orig/include/linux/completion.h linux-4.9.6/include/linux/completion.h
6179 --- linux-4.9.6.orig/include/linux/completion.h 2017-01-26 08:25:24.000000000 +0100
6180 +++ linux-4.9.6/include/linux/completion.h 2017-01-28 13:59:09.935656342 +0100
6182 * Atomic wait-for-completion handler data structures.
6183 * See kernel/sched/completion.c for details.
6186 -#include <linux/wait.h>
6187 +#include <linux/swait.h>
6190 * struct completion - structure used to maintain state for a "completion"
6195 - wait_queue_head_t wait;
6196 + struct swait_queue_head wait;
6199 #define COMPLETION_INITIALIZER(work) \
6200 - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6201 + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6203 #define COMPLETION_INITIALIZER_ONSTACK(work) \
6204 ({ init_completion(&work); work; })
6206 static inline void init_completion(struct completion *x)
6209 - init_waitqueue_head(&x->wait);
6210 + init_swait_queue_head(&x->wait);
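
Completions move to simple wait queues: swait heads use a raw spinlock and do bounded wake-up work, so complete() stays safe from hard-IRQ context on RT. The completion API itself is unchanged for users; a minimal sketch:

    static DECLARE_COMPLETION(fw_loaded);

    static irqreturn_t fw_irq(int irq, void *dev_id)
    {
            complete(&fw_loaded);             /* raw swait lock: RT-safe here */
            return IRQ_HANDLED;
    }

    static int fw_wait(void)
    {
            return wait_for_completion_interruptible(&fw_loaded);
    }
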
6214 diff -Nur linux-4.9.6.orig/include/linux/cpu.h linux-4.9.6/include/linux/cpu.h
6215 --- linux-4.9.6.orig/include/linux/cpu.h 2017-01-26 08:25:24.000000000 +0100
6216 +++ linux-4.9.6/include/linux/cpu.h 2017-01-28 13:59:09.935656342 +0100
6218 extern void put_online_cpus(void);
6219 extern void cpu_hotplug_disable(void);
6220 extern void cpu_hotplug_enable(void);
6221 +extern void pin_current_cpu(void);
6222 +extern void unpin_current_cpu(void);
6223 #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
6224 #define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri)
6225 #define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
6227 #define put_online_cpus() do { } while (0)
6228 #define cpu_hotplug_disable() do { } while (0)
6229 #define cpu_hotplug_enable() do { } while (0)
6230 +static inline void pin_current_cpu(void) { }
6231 +static inline void unpin_current_cpu(void) { }
6232 #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
6233 #define __hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
6234 /* These aren't inline functions due to a GCC bug. */
6235 diff -Nur linux-4.9.6.orig/include/linux/dcache.h linux-4.9.6/include/linux/dcache.h
6236 --- linux-4.9.6.orig/include/linux/dcache.h 2017-01-26 08:25:24.000000000 +0100
6237 +++ linux-4.9.6/include/linux/dcache.h 2017-01-28 13:59:09.935656342 +0100
6239 #include <linux/rcupdate.h>
6240 #include <linux/lockref.h>
6241 #include <linux/stringhash.h>
6242 +#include <linux/wait.h>
6249 struct list_head d_lru; /* LRU list */
6250 - wait_queue_head_t *d_wait; /* in-lookup ones only */
6251 + struct swait_queue_head *d_wait; /* in-lookup ones only */
6253 struct list_head d_child; /* child of parent list */
6254 struct list_head d_subdirs; /* our children */
6256 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6257 extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6258 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6259 - wait_queue_head_t *);
6260 + struct swait_queue_head *);
6261 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6262 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6263 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6264 diff -Nur linux-4.9.6.orig/include/linux/delay.h linux-4.9.6/include/linux/delay.h
6265 --- linux-4.9.6.orig/include/linux/delay.h 2017-01-26 08:25:24.000000000 +0100
6266 +++ linux-4.9.6/include/linux/delay.h 2017-01-28 13:59:09.935656342 +0100
6268 msleep(seconds * 1000);
6271 +#ifdef CONFIG_PREEMPT_RT_FULL
6272 +extern void cpu_chill(void);
6273 +#else
6274 +# define cpu_chill() cpu_relax()
6275 +#endif
6277 #endif /* defined(_LINUX_DELAY_H) */
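
cpu_chill() is meant for retry loops that wait on progress made by another, possibly preempted task: on !RT it stays a cpu_relax() busy-wait, while on RT it sleeps briefly so the preempted task can actually run. A sketch with a hypothetical trylock-style operation:

    /* try_grab_object() is a stand-in for any operation that can fail
     * transiently while another task holds the resource. */
    extern bool try_grab_object(void *obj);

    static void grab_object_slowpath(void *obj)
    {
            while (!try_grab_object(obj))
                    cpu_chill();      /* !RT: spin hint; RT: short sleep */
    }
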
6278 diff -Nur linux-4.9.6.orig/include/linux/highmem.h linux-4.9.6/include/linux/highmem.h
6279 --- linux-4.9.6.orig/include/linux/highmem.h 2017-01-26 08:25:24.000000000 +0100
6280 +++ linux-4.9.6/include/linux/highmem.h 2017-01-28 13:59:09.935656342 +0100
6282 #include <linux/mm.h>
6283 #include <linux/uaccess.h>
6284 #include <linux/hardirq.h>
6285 +#include <linux/sched.h>
6287 #include <asm/cacheflush.h>
6291 static inline void *kmap_atomic(struct page *page)
6293 - preempt_disable();
6294 + preempt_disable_nort();
6295 pagefault_disable();
6296 return page_address(page);
6299 static inline void __kunmap_atomic(void *addr)
6303 + preempt_enable_nort();
6306 #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn))
6309 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6311 +#ifndef CONFIG_PREEMPT_RT_FULL
6312 DECLARE_PER_CPU(int, __kmap_atomic_idx);
6315 static inline int kmap_atomic_idx_push(void)
6317 +#ifndef CONFIG_PREEMPT_RT_FULL
6318 int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6320 -#ifdef CONFIG_DEBUG_HIGHMEM
6321 +# ifdef CONFIG_DEBUG_HIGHMEM
6322 WARN_ON_ONCE(in_irq() && !irqs_disabled());
6323 BUG_ON(idx >= KM_TYPE_NR);
6324 -#endif
6325 +# endif
6326 return idx;
6327 +#else
6328 + current->kmap_idx++;
6329 + BUG_ON(current->kmap_idx > KM_TYPE_NR);
6330 + return current->kmap_idx - 1;
6331 +#endif
6334 static inline int kmap_atomic_idx(void)
6336 +#ifndef CONFIG_PREEMPT_RT_FULL
6337 return __this_cpu_read(__kmap_atomic_idx) - 1;
6338 +#else
6339 + return current->kmap_idx - 1;
6340 +#endif
6343 static inline void kmap_atomic_idx_pop(void)
6345 -#ifdef CONFIG_DEBUG_HIGHMEM
6346 +#ifndef CONFIG_PREEMPT_RT_FULL
6347 +# ifdef CONFIG_DEBUG_HIGHMEM
6348 int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6350 BUG_ON(idx < 0);
6351 -#else
6352 +# else
6353 __this_cpu_dec(__kmap_atomic_idx);
6354 +# endif
6355 +#else
6356 + current->kmap_idx--;
6357 +# ifdef CONFIG_DEBUG_HIGHMEM
6358 + BUG_ON(current->kmap_idx < 0);
6359 +# endif
6360 +#endif
6363 diff -Nur linux-4.9.6.orig/include/linux/hrtimer.h linux-4.9.6/include/linux/hrtimer.h
6364 --- linux-4.9.6.orig/include/linux/hrtimer.h 2017-01-26 08:25:24.000000000 +0100
6365 +++ linux-4.9.6/include/linux/hrtimer.h 2017-01-28 13:59:09.935656342 +0100
6367 * @function: timer expiry callback function
6368 * @base: pointer to the timer base (per cpu and per clock)
6369 * @state: state information (See bit values above)
6370 + * @cb_entry: list entry to defer timers from hardirq context
6371 + * @irqsafe: timer can run in hardirq context
6372 + * @praecox: timer expiry time if expired at the time of programming
6373 * @is_rel: Set if the timer was armed relative
6374 * @start_pid: timer statistics field to store the pid of the task which
6376 @@ -103,6 +106,11 @@
6377 enum hrtimer_restart (*function)(struct hrtimer *);
6378 struct hrtimer_clock_base *base;
6380 + struct list_head cb_entry;
6382 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6386 #ifdef CONFIG_TIMER_STATS
6388 @@ -123,11 +131,7 @@
6389 struct task_struct *task;
6392 -#ifdef CONFIG_64BIT
6393 # define HRTIMER_CLOCK_BASE_ALIGN 64
6395 -# define HRTIMER_CLOCK_BASE_ALIGN 32
6399 * struct hrtimer_clock_base - the timer base for a specific clock
6401 * timer to a base on another cpu.
6402 * @clockid: clock id for per_cpu support
6403 * @active: red black tree root node for the active timers
6404 + * @expired: list head for deferred timers.
6405 * @get_time: function to retrieve the current time of the clock
6406 * @offset: offset of this clock to the monotonic base
6411 struct timerqueue_head active;
6412 + struct list_head expired;
6413 ktime_t (*get_time)(void);
6415 } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6417 raw_spinlock_t lock;
6419 struct hrtimer *running;
6420 + struct hrtimer *running_soft;
6422 unsigned int active_bases;
6423 unsigned int clock_was_set_seq;
6425 unsigned int nr_hangs;
6426 unsigned int max_hang_time;
6428 +#ifdef CONFIG_PREEMPT_RT_BASE
6429 + wait_queue_head_t wait;
6431 struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
6432 } ____cacheline_aligned;
6434 @@ -412,6 +422,13 @@
6435 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6438 +/* Softirq preemption could deadlock timer removal */
6439 +#ifdef CONFIG_PREEMPT_RT_BASE
6440 + extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6442 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6446 extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6449 * Helper function to check, whether the timer is running the callback
6452 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6453 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6455 return timer->base->cpu_base->running == timer;
6457 diff -Nur linux-4.9.6.orig/include/linux/idr.h linux-4.9.6/include/linux/idr.h
6458 --- linux-4.9.6.orig/include/linux/idr.h 2017-01-26 08:25:24.000000000 +0100
6459 +++ linux-4.9.6/include/linux/idr.h 2017-01-28 13:59:09.935656342 +0100
6461 * Each idr_preload() should be matched with an invocation of this
6462 * function. See idr_preload() for details.
6464 +#ifdef CONFIG_PREEMPT_RT_FULL
6465 +void idr_preload_end(void);
6466 +#else
6467 static inline void idr_preload_end(void)
6474 * idr_find - return pointer for given id
6475 diff -Nur linux-4.9.6.orig/include/linux/init_task.h linux-4.9.6/include/linux/init_task.h
6476 --- linux-4.9.6.orig/include/linux/init_task.h 2017-01-26 08:25:24.000000000 +0100
6477 +++ linux-4.9.6/include/linux/init_task.h 2017-01-28 13:59:09.935656342 +0100
6478 @@ -150,6 +150,12 @@
6479 # define INIT_PERF_EVENTS(tsk)
6482 +#ifdef CONFIG_PREEMPT_RT_BASE
6483 +# define INIT_TIMER_LIST .posix_timer_list = NULL,
6484 +#else
6485 +# define INIT_TIMER_LIST
6486 +#endif
6488 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6489 # define INIT_VTIME(tsk) \
6490 .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
6492 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
6493 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
6494 .timer_slack_ns = 50000, /* 50 usec default slack */ \
6497 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
6498 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
6499 diff -Nur linux-4.9.6.orig/include/linux/interrupt.h linux-4.9.6/include/linux/interrupt.h
6500 --- linux-4.9.6.orig/include/linux/interrupt.h 2017-01-26 08:25:24.000000000 +0100
6501 +++ linux-4.9.6/include/linux/interrupt.h 2017-01-28 13:59:09.935656342 +0100
6503 #include <linux/hrtimer.h>
6504 #include <linux/kref.h>
6505 #include <linux/workqueue.h>
6506 +#include <linux/swork.h>
6508 #include <linux/atomic.h>
6509 #include <asm/ptrace.h>
6511 * interrupt handler after suspending interrupts. For system
6512 * wakeup devices users need to implement wakeup detection in
6513 * their interrupt handlers.
6514 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6516 #define IRQF_SHARED 0x00000080
6517 #define IRQF_PROBE_SHARED 0x00000100
6519 #define IRQF_NO_THREAD 0x00010000
6520 #define IRQF_EARLY_RESUME 0x00020000
6521 #define IRQF_COND_SUSPEND 0x00040000
6522 +#define IRQF_NO_SOFTIRQ_CALL 0x00080000
6524 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6527 #ifdef CONFIG_LOCKDEP
6528 # define local_irq_enable_in_hardirq() do { } while (0)
6530 -# define local_irq_enable_in_hardirq() local_irq_enable()
6531 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6534 extern void disable_irq_nosync(unsigned int irq);
6536 * struct irq_affinity_notify - context for notification of IRQ affinity changes
6537 * @irq: Interrupt to which notification applies
6538 * @kref: Reference count, for internal use
6539 + * @swork: Swork item, for internal use
6540 * @work: Work item, for internal use
6541 * @notify: Function to be called on change. This will be
6542 * called in process context.
6543 @@ -227,7 +231,11 @@
6544 struct irq_affinity_notify {
6547 +#ifdef CONFIG_PREEMPT_RT_BASE
6548 + struct swork_event swork;
6549 +#else
6550 struct work_struct work;
6551 +#endif
6552 void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6553 void (*release)(struct kref *ref);
6555 @@ -406,9 +414,13 @@
6558 #ifdef CONFIG_IRQ_FORCED_THREADING
6559 +# ifndef CONFIG_PREEMPT_RT_BASE
6560 extern bool force_irqthreads;
6561 +# else
6562 +# define force_irqthreads (true)
6563 +# endif
6564 #else
6565 -#define force_irqthreads (0)
6566 +#define force_irqthreads (false)
6569 #ifndef __ARCH_SET_SOFTIRQ_PENDING
6570 @@ -465,9 +477,10 @@
6571 void (*action)(struct softirq_action *);
6574 +#ifndef CONFIG_PREEMPT_RT_FULL
6575 asmlinkage void do_softirq(void);
6576 asmlinkage void __do_softirq(void);
6578 +static inline void thread_do_softirq(void) { do_softirq(); }
6579 #ifdef __ARCH_HAS_DO_SOFTIRQ
6580 void do_softirq_own_stack(void);
6582 @@ -476,13 +489,25 @@
6587 +extern void thread_do_softirq(void);
6590 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
6591 extern void softirq_init(void);
6592 extern void __raise_softirq_irqoff(unsigned int nr);
6593 +#ifdef CONFIG_PREEMPT_RT_FULL
6594 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
6595 +#else
6596 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
6597 +{
6598 + __raise_softirq_irqoff(nr);
6599 +}
6600 +#endif
6602 extern void raise_softirq_irqoff(unsigned int nr);
6603 extern void raise_softirq(unsigned int nr);
6604 +extern void softirq_check_pending_idle(void);
6606 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
6609 to be executed on some cpu at least once after this.
6610 * If the tasklet is already scheduled, but its execution is still not
6611 started, it will be executed only once.
6612 - * If this tasklet is already running on another CPU (or schedule is called
6613 - from tasklet itself), it is rescheduled for later.
6614 + * If this tasklet is already running on another CPU, it is rescheduled
6615 +   for later.
6616 + * Schedule must not be called from the tasklet itself (a lockup occurs)
6617 * Tasklet is strictly serialized wrt itself, but not
6618 wrt another tasklets. If client needs some intertask synchronization,
6619 he makes it with spinlocks.
6620 @@ -530,27 +556,36 @@
6623 TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
6624 - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
6625 + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
6626 + TASKLET_STATE_PENDING /* Tasklet is pending */
6630 +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
6631 +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
6632 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
6634 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
6635 static inline int tasklet_trylock(struct tasklet_struct *t)
6637 return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
6640 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
6642 + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
6645 static inline void tasklet_unlock(struct tasklet_struct *t)
6647 smp_mb__before_atomic();
6648 clear_bit(TASKLET_STATE_RUN, &(t)->state);
6651 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
6653 - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
6655 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
6658 #define tasklet_trylock(t) 1
6659 +#define tasklet_tryunlock(t) 1
6660 #define tasklet_unlock_wait(t) do { } while (0)
6661 #define tasklet_unlock(t) do { } while (0)
6663 @@ -599,12 +634,7 @@
6667 -static inline void tasklet_enable(struct tasklet_struct *t)
6669 - smp_mb__before_atomic();
6670 - atomic_dec(&t->count);
6673 +extern void tasklet_enable(struct tasklet_struct *t);
6674 extern void tasklet_kill(struct tasklet_struct *t);
6675 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
6676 extern void tasklet_init(struct tasklet_struct *t,
6677 @@ -635,6 +665,12 @@
6678 tasklet_kill(&ttimer->tasklet);
6681 +#ifdef CONFIG_PREEMPT_RT_FULL
6682 +extern void softirq_early_init(void);
6683 +#else
6684 +static inline void softirq_early_init(void) { }
6685 +#endif
6688 * Autoprobing for irqs:
6690 diff -Nur linux-4.9.6.orig/include/linux/irqdesc.h linux-4.9.6/include/linux/irqdesc.h
6691 --- linux-4.9.6.orig/include/linux/irqdesc.h 2017-01-26 08:25:24.000000000 +0100
6692 +++ linux-4.9.6/include/linux/irqdesc.h 2017-01-28 13:59:09.935656342 +0100
6694 unsigned int irqs_unhandled;
6695 atomic_t threads_handled;
6696 int threads_handled_last;
6698 raw_spinlock_t lock;
6699 struct cpumask *percpu_enabled;
6700 const struct cpumask *percpu_affinity;
6701 diff -Nur linux-4.9.6.orig/include/linux/irqflags.h linux-4.9.6/include/linux/irqflags.h
6702 --- linux-4.9.6.orig/include/linux/irqflags.h 2017-01-26 08:25:24.000000000 +0100
6703 +++ linux-4.9.6/include/linux/irqflags.h 2017-01-28 13:59:09.935656342 +0100
6705 # define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
6706 # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
6707 # define trace_hardirq_exit() do { current->hardirq_context--; } while (0)
6708 -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
6709 -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
6710 # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
6712 # define trace_hardirqs_on() do { } while (0)
6714 # define trace_softirqs_enabled(p) 0
6715 # define trace_hardirq_enter() do { } while (0)
6716 # define trace_hardirq_exit() do { } while (0)
6717 +# define INIT_TRACE_IRQFLAGS
6720 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
6721 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
6722 +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
6724 # define lockdep_softirq_enter() do { } while (0)
6725 # define lockdep_softirq_exit() do { } while (0)
6726 -# define INIT_TRACE_IRQFLAGS
6729 #if defined(CONFIG_IRQSOFF_TRACER) || \
6730 @@ -148,4 +152,23 @@
6732 #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
6735 + * local_irq* variants depending on RT/!RT
6737 +#ifdef CONFIG_PREEMPT_RT_FULL
6738 +# define local_irq_disable_nort() do { } while (0)
6739 +# define local_irq_enable_nort() do { } while (0)
6740 +# define local_irq_save_nort(flags) local_save_flags(flags)
6741 +# define local_irq_restore_nort(flags) (void)(flags)
6742 +# define local_irq_disable_rt() local_irq_disable()
6743 +# define local_irq_enable_rt() local_irq_enable()
6745 +# define local_irq_disable_nort() local_irq_disable()
6746 +# define local_irq_enable_nort() local_irq_enable()
6747 +# define local_irq_save_nort(flags) local_irq_save(flags)
6748 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
6749 +# define local_irq_disable_rt() do { } while (0)
6750 +# define local_irq_enable_rt() do { } while (0)
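
The _nort variants really disable interrupts only on !RT; on RT they degrade to saving the flags (or to nothing), because the section they guard is protected by sleeping locks instead and must stay preemptible. This mirrors the ntfs/aops.c hunks earlier in the patch; a sketch (function name is illustrative):

    static void zero_partial_block(struct page *page, unsigned int ofs,
                                   unsigned int len)
    {
            unsigned long flags;
            void *kaddr;

            local_irq_save_nort(flags);   /* hard irq-off only on !RT */
            kaddr = kmap_atomic(page);
            memset(kaddr + ofs, 0, len);
            kunmap_atomic(kaddr);
            local_irq_restore_nort(flags);
    }
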
6754 diff -Nur linux-4.9.6.orig/include/linux/irq.h linux-4.9.6/include/linux/irq.h
6755 --- linux-4.9.6.orig/include/linux/irq.h 2017-01-26 08:25:24.000000000 +0100
6756 +++ linux-4.9.6/include/linux/irq.h 2017-01-28 13:59:09.935656342 +0100
6758 * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
6759 * it from the spurious interrupt detection
6760 * mechanism and from core side polling.
6761 + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
6762 * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
6765 @@ -99,13 +100,14 @@
6766 IRQ_PER_CPU_DEVID = (1 << 17),
6767 IRQ_IS_POLLED = (1 << 18),
6768 IRQ_DISABLE_UNLAZY = (1 << 19),
6769 + IRQ_NO_SOFTIRQ_CALL = (1 << 20),
6772 #define IRQF_MODIFY_MASK \
6773 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
6774 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
6775 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
6776 - IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
6777 + IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
6779 #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
6781 diff -Nur linux-4.9.6.orig/include/linux/irq_work.h linux-4.9.6/include/linux/irq_work.h
6782 --- linux-4.9.6.orig/include/linux/irq_work.h 2017-01-26 08:25:24.000000000 +0100
6783 +++ linux-4.9.6/include/linux/irq_work.h 2017-01-28 13:59:09.935656342 +0100
6785 #define IRQ_WORK_BUSY 2UL
6786 #define IRQ_WORK_FLAGS 3UL
6787 #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
6788 +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
6791 unsigned long flags;
6793 static inline void irq_work_run(void) { }
6796 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
6797 +void irq_work_tick_soft(void);
6799 +static inline void irq_work_tick_soft(void) { }
6802 #endif /* _LINUX_IRQ_WORK_H */
6803 diff -Nur linux-4.9.6.orig/include/linux/jbd2.h linux-4.9.6/include/linux/jbd2.h
6804 --- linux-4.9.6.orig/include/linux/jbd2.h 2017-01-26 08:25:24.000000000 +0100
6805 +++ linux-4.9.6/include/linux/jbd2.h 2017-01-28 13:59:09.939656496 +0100
6806 @@ -347,32 +347,56 @@
6808 static inline void jbd_lock_bh_state(struct buffer_head *bh)
6810 +#ifndef CONFIG_PREEMPT_RT_BASE
6811 bit_spin_lock(BH_State, &bh->b_state);
6812 +#else
6813 + spin_lock(&bh->b_state_lock);
6814 +#endif
6817 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
6819 +#ifndef CONFIG_PREEMPT_RT_BASE
6820 return bit_spin_trylock(BH_State, &bh->b_state);
6821 +#else
6822 + return spin_trylock(&bh->b_state_lock);
6823 +#endif
6826 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
6828 +#ifndef CONFIG_PREEMPT_RT_BASE
6829 return bit_spin_is_locked(BH_State, &bh->b_state);
6830 +#else
6831 + return spin_is_locked(&bh->b_state_lock);
6832 +#endif
6835 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
6837 +#ifndef CONFIG_PREEMPT_RT_BASE
6838 bit_spin_unlock(BH_State, &bh->b_state);
6839 +#else
6840 + spin_unlock(&bh->b_state_lock);
6841 +#endif
6844 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
6846 +#ifndef CONFIG_PREEMPT_RT_BASE
6847 bit_spin_lock(BH_JournalHead, &bh->b_state);
6848 +#else
6849 + spin_lock(&bh->b_journal_head_lock);
6850 +#endif
6853 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
6855 +#ifndef CONFIG_PREEMPT_RT_BASE
6856 bit_spin_unlock(BH_JournalHead, &bh->b_state);
6857 +#else
6858 + spin_unlock(&bh->b_journal_head_lock);
6859 +#endif
6862 #define J_ASSERT(assert) BUG_ON(!(assert))
6863 diff -Nur linux-4.9.6.orig/include/linux/kdb.h linux-4.9.6/include/linux/kdb.h
6864 --- linux-4.9.6.orig/include/linux/kdb.h 2017-01-26 08:25:24.000000000 +0100
6865 +++ linux-4.9.6/include/linux/kdb.h 2017-01-28 13:59:09.939656496 +0100
6867 extern __printf(1, 2) int kdb_printf(const char *, ...);
6868 typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
6870 +#define in_kdb_printk() (kdb_trap_printk)
6871 extern void kdb_init(int level);
6873 /* Access to kdb specific polling devices */
6875 extern int kdb_unregister(char *);
6876 #else /* ! CONFIG_KGDB_KDB */
6877 static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
6878 +#define in_kdb_printk() (0)
6879 static inline void kdb_init(int level) {}
6880 static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
6881 char *help, short minlen) { return 0; }
6882 diff -Nur linux-4.9.6.orig/include/linux/kernel.h linux-4.9.6/include/linux/kernel.h
6883 --- linux-4.9.6.orig/include/linux/kernel.h 2017-01-26 08:25:24.000000000 +0100
6884 +++ linux-4.9.6/include/linux/kernel.h 2017-01-28 13:59:09.939656496 +0100
6887 # define might_sleep() \
6888 do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
6890 +# define might_sleep_no_state_check() \
6891 + do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
6892 # define sched_annotate_sleep() (current->task_state_change = 0)
6894 static inline void ___might_sleep(const char *file, int line,
6896 static inline void __might_sleep(const char *file, int line,
6897 int preempt_offset) { }
6898 # define might_sleep() do { might_resched(); } while (0)
6899 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
6900 # define sched_annotate_sleep() do { } while (0)
6910 #define TAINT_PROPRIETARY_MODULE 0
6911 diff -Nur linux-4.9.6.orig/include/linux/list_bl.h linux-4.9.6/include/linux/list_bl.h
6912 --- linux-4.9.6.orig/include/linux/list_bl.h 2017-01-26 08:25:24.000000000 +0100
6913 +++ linux-4.9.6/include/linux/list_bl.h 2017-01-28 13:59:09.939656496 +0100
6915 #define _LINUX_LIST_BL_H
6917 #include <linux/list.h>
6918 +#include <linux/spinlock.h>
6919 #include <linux/bit_spinlock.h>
6924 struct hlist_bl_head {
6925 struct hlist_bl_node *first;
6926 +#ifdef CONFIG_PREEMPT_RT_BASE
6927 + raw_spinlock_t lock;
6928 +#endif
6931 struct hlist_bl_node {
6932 struct hlist_bl_node *next, **pprev;
6934 -#define INIT_HLIST_BL_HEAD(ptr) \
6935 - ((ptr)->first = NULL)
6937 +#ifdef CONFIG_PREEMPT_RT_BASE
6938 +#define INIT_HLIST_BL_HEAD(h) \
6939 +do { \
6940 + (h)->first = NULL; \
6941 + raw_spin_lock_init(&(h)->lock); \
6942 +} while (0)
6943 +#else
6944 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
6945 +#endif
6947 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
6949 @@ -118,12 +130,26 @@
6951 static inline void hlist_bl_lock(struct hlist_bl_head *b)
6953 +#ifndef CONFIG_PREEMPT_RT_BASE
6954 bit_spin_lock(0, (unsigned long *)b);
6955 +#else
6956 + raw_spin_lock(&b->lock);
6957 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
6958 + __set_bit(0, (unsigned long *)b);
6959 +#endif
6960 +#endif
6963 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
6965 +#ifndef CONFIG_PREEMPT_RT_BASE
6966 __bit_spin_unlock(0, (unsigned long *)b);
6967 +#else
6968 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
6969 + __clear_bit(0, (unsigned long *)b);
6970 +#endif
6971 + raw_spin_unlock(&b->lock);
6972 +#endif
6975 static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
6976 diff -Nur linux-4.9.6.orig/include/linux/locallock.h linux-4.9.6/include/linux/locallock.h
6977 --- linux-4.9.6.orig/include/linux/locallock.h 1970-01-01 01:00:00.000000000 +0100
6978 +++ linux-4.9.6/include/linux/locallock.h 2017-01-28 13:59:09.939656496 +0100
6980 +#ifndef _LINUX_LOCALLOCK_H
6981 +#define _LINUX_LOCALLOCK_H
6983 +#include <linux/percpu.h>
6984 +#include <linux/spinlock.h>
6986 +#ifdef CONFIG_PREEMPT_RT_BASE
6988 +#ifdef CONFIG_DEBUG_SPINLOCK
6989 +# define LL_WARN(cond) WARN_ON(cond)
6991 +# define LL_WARN(cond) do { } while (0)
6995 + * per cpu lock based substitute for local_irq_*()
6997 +struct local_irq_lock {
6998 + spinlock_t lock;
6999 + struct task_struct *owner;
7000 + int nestcnt;
7001 + unsigned long flags;
7002 +};
7004 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
7005 + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
7006 + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7008 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
7009 + DECLARE_PER_CPU(struct local_irq_lock, lvar)
7011 +#define local_irq_lock_init(lvar) \
7014 + for_each_possible_cpu(__cpu) \
7015 + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
7019 + * spin_lock|trylock|unlock_local flavours that do not migrate_disable();
7020 + * used for __local_lock|trylock|unlock, where get_local_var/put_local_var
7021 + * already take care of the migrate_disable/enable. Without
7022 + * PREEMPT_RT_FULL they map to the normal spin_* calls.
7024 +#ifdef CONFIG_PREEMPT_RT_FULL
7025 +# define spin_lock_local(lock) rt_spin_lock__no_mg(lock)
7026 +# define spin_trylock_local(lock) rt_spin_trylock__no_mg(lock)
7027 +# define spin_unlock_local(lock) rt_spin_unlock__no_mg(lock)
7029 +# define spin_lock_local(lock) spin_lock(lock)
7030 +# define spin_trylock_local(lock) spin_trylock(lock)
7031 +# define spin_unlock_local(lock) spin_unlock(lock)
7034 +static inline void __local_lock(struct local_irq_lock *lv)
7036 + if (lv->owner != current) {
7037 + spin_lock_local(&lv->lock);
7038 + LL_WARN(lv->owner);
7039 + LL_WARN(lv->nestcnt);
7040 + lv->owner = current;
7045 +#define local_lock(lvar) \
7046 + do { __local_lock(&get_local_var(lvar)); } while (0)
7048 +#define local_lock_on(lvar, cpu) \
7049 + do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7051 +static inline int __local_trylock(struct local_irq_lock *lv)
7053 + if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7054 + LL_WARN(lv->owner);
7055 + LL_WARN(lv->nestcnt);
7056 + lv->owner = current;
7063 +#define local_trylock(lvar) \
7066 + __locked = __local_trylock(&get_local_var(lvar)); \
7068 + put_local_var(lvar); \
7072 +static inline void __local_unlock(struct local_irq_lock *lv)
7074 + LL_WARN(lv->nestcnt == 0);
7075 + LL_WARN(lv->owner != current);
7076 + if (--lv->nestcnt)
7080 + spin_unlock_local(&lv->lock);
7083 +#define local_unlock(lvar) \
7085 + __local_unlock(this_cpu_ptr(&lvar)); \
7086 + put_local_var(lvar); \
7089 +#define local_unlock_on(lvar, cpu) \
7090 + do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7092 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7094 + spin_lock_irqsave(&lv->lock, lv->flags);
7095 + LL_WARN(lv->owner);
7096 + LL_WARN(lv->nestcnt);
7097 + lv->owner = current;
7101 +#define local_lock_irq(lvar) \
7102 + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7104 +#define local_lock_irq_on(lvar, cpu) \
7105 + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7107 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7109 + LL_WARN(!lv->nestcnt);
7110 + LL_WARN(lv->owner != current);
7113 + spin_unlock_irq(&lv->lock);
7116 +#define local_unlock_irq(lvar) \
7118 + __local_unlock_irq(this_cpu_ptr(&lvar)); \
7119 + put_local_var(lvar); \
7122 +#define local_unlock_irq_on(lvar, cpu) \
7124 + __local_unlock_irq(&per_cpu(lvar, cpu)); \
7127 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7129 + if (lv->owner != current) {
7130 + __local_lock_irq(lv);
7138 +#define local_lock_irqsave(lvar, _flags) \
7140 + if (__local_lock_irqsave(&get_local_var(lvar))) \
7141 + put_local_var(lvar); \
7142 + _flags = __this_cpu_read(lvar.flags); \
7145 +#define local_lock_irqsave_on(lvar, _flags, cpu) \
7147 + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
7148 + _flags = per_cpu(lvar, cpu).flags; \
7151 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7152 + unsigned long flags)
7154 + LL_WARN(!lv->nestcnt);
7155 + LL_WARN(lv->owner != current);
7156 + if (--lv->nestcnt)
7160 + spin_unlock_irqrestore(&lv->lock, lv->flags);
7164 +#define local_unlock_irqrestore(lvar, flags) \
7166 + if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7167 + put_local_var(lvar); \
7170 +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
7172 + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
7175 +#define local_spin_trylock_irq(lvar, lock) \
7178 + local_lock_irq(lvar); \
7179 + __locked = spin_trylock(lock); \
7181 + local_unlock_irq(lvar); \
7185 +#define local_spin_lock_irq(lvar, lock) \
7187 + local_lock_irq(lvar); \
7188 + spin_lock(lock); \
7191 +#define local_spin_unlock_irq(lvar, lock) \
7193 + spin_unlock(lock); \
7194 + local_unlock_irq(lvar); \
7197 +#define local_spin_lock_irqsave(lvar, lock, flags) \
7199 + local_lock_irqsave(lvar, flags); \
7200 + spin_lock(lock); \
7203 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
7205 + spin_unlock(lock); \
7206 + local_unlock_irqrestore(lvar, flags); \
7209 +#define get_locked_var(lvar, var) \
7211 + local_lock(lvar); \
7212 + this_cpu_ptr(&var); \
7215 +#define put_locked_var(lvar, var) local_unlock(lvar);
7217 +#define local_lock_cpu(lvar) \
7219 + local_lock(lvar); \
7220 + smp_processor_id(); \
7223 +#define local_unlock_cpu(lvar) local_unlock(lvar)
7225 +#else /* PREEMPT_RT_BASE */
7227 +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
7228 +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
7230 +static inline void local_irq_lock_init(int lvar) { }
7232 +#define local_lock(lvar) preempt_disable()
7233 +#define local_unlock(lvar) preempt_enable()
7234 +#define local_lock_irq(lvar) local_irq_disable()
7235 +#define local_lock_irq_on(lvar, cpu) local_irq_disable()
7236 +#define local_unlock_irq(lvar) local_irq_enable()
7237 +#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
7238 +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
7239 +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
7241 +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
7242 +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
7243 +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
7244 +#define local_spin_lock_irqsave(lvar, lock, flags) \
7245 + spin_lock_irqsave(lock, flags)
7246 +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
7247 + spin_unlock_irqrestore(lock, flags)
7249 +#define get_locked_var(lvar, var) get_cpu_var(var)
7250 +#define put_locked_var(lvar, var) put_cpu_var(var)
7252 +#define local_lock_cpu(lvar) get_cpu()
7253 +#define local_unlock_cpu(lvar) put_cpu()
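
Typical locallock usage: per-CPU data that was previously guarded by local_irq_save() gets a named per-CPU lock. On !RT the macros collapse to the original preempt/irq primitives at zero cost; on RT they take the per-CPU spinlock, which preserves the "same CPU, exclusive access" invariant while remaining preemptible. A sketch with hypothetical per-CPU stats:

    struct my_stats {
            unsigned long events;
    };

    static DEFINE_PER_CPU(struct my_stats, my_stats);
    static DEFINE_LOCAL_IRQ_LOCK(my_stats_lock);

    static void my_stats_event(void)
    {
            unsigned long flags;

            local_lock_irqsave(my_stats_lock, flags);
            this_cpu_ptr(&my_stats)->events++;
            local_unlock_irqrestore(my_stats_lock, flags);
    }
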
7258 diff -Nur linux-4.9.6.orig/include/linux/mm_types.h linux-4.9.6/include/linux/mm_types.h
7259 --- linux-4.9.6.orig/include/linux/mm_types.h 2017-01-26 08:25:24.000000000 +0100
7260 +++ linux-4.9.6/include/linux/mm_types.h 2017-01-28 13:59:09.939656496 +0100
7262 #include <linux/completion.h>
7263 #include <linux/cpumask.h>
7264 #include <linux/uprobes.h>
7265 +#include <linux/rcupdate.h>
7266 #include <linux/page-flags-layout.h>
7267 #include <linux/workqueue.h>
7268 #include <asm/page.h>
7270 bool tlb_flush_pending;
7272 struct uprobes_state uprobes_state;
7273 +#ifdef CONFIG_PREEMPT_RT_BASE
7274 + struct rcu_head delayed_drop;
7276 #ifdef CONFIG_X86_INTEL_MPX
7277 /* address of the bounds directory */
7278 void __user *bd_addr;
7279 diff -Nur linux-4.9.6.orig/include/linux/mutex.h linux-4.9.6/include/linux/mutex.h
7280 --- linux-4.9.6.orig/include/linux/mutex.h 2017-01-26 08:25:24.000000000 +0100
7281 +++ linux-4.9.6/include/linux/mutex.h 2017-01-28 13:59:09.939656496 +0100
7283 #include <asm/processor.h>
7284 #include <linux/osq_lock.h>
7286 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7287 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7288 + , .dep_map = { .name = #lockname }
7290 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7293 +#ifdef CONFIG_PREEMPT_RT_FULL
7294 +# include <linux/mutex_rt.h>
7298 * Simple, straightforward mutexes with strict semantics:
7301 static inline void mutex_destroy(struct mutex *lock) {}
7304 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7305 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7306 - , .dep_map = { .name = #lockname }
7308 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7311 #define __MUTEX_INITIALIZER(lockname) \
7312 { .count = ATOMIC_INIT(1) \
7313 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7315 extern int mutex_trylock(struct mutex *lock);
7316 extern void mutex_unlock(struct mutex *lock);
7318 +#endif /* !PREEMPT_RT_FULL */
7320 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7322 #endif /* __LINUX_MUTEX_H */
7323 diff -Nur linux-4.9.6.orig/include/linux/mutex_rt.h linux-4.9.6/include/linux/mutex_rt.h
7324 --- linux-4.9.6.orig/include/linux/mutex_rt.h 1970-01-01 01:00:00.000000000 +0100
7325 +++ linux-4.9.6/include/linux/mutex_rt.h 2017-01-28 13:59:09.939656496 +0100
7327 +#ifndef __LINUX_MUTEX_RT_H
7328 +#define __LINUX_MUTEX_RT_H
7330 +#ifndef __LINUX_MUTEX_H
7331 +#error "Please include mutex.h"
7334 +#include <linux/rtmutex.h>
7336 +/* FIXME: Just for __lockfunc */
7337 +#include <linux/spinlock.h>
7339 +struct mutex {
7340 + struct rt_mutex lock;
7341 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7342 + struct lockdep_map dep_map;
7343 +#endif
7344 +};
7346 +#define __MUTEX_INITIALIZER(mutexname) \
7348 + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
7349 + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
7352 +#define DEFINE_MUTEX(mutexname) \
7353 + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7355 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7356 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7357 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7358 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7359 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7360 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7361 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7362 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7363 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7364 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7366 +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
7367 +#define mutex_lock(l) _mutex_lock(l)
7368 +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
7369 +#define mutex_lock_killable(l) _mutex_lock_killable(l)
7370 +#define mutex_trylock(l) _mutex_trylock(l)
7371 +#define mutex_unlock(l) _mutex_unlock(l)
7372 +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
7374 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7375 +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
7376 +# define mutex_lock_interruptible_nested(l, s) \
7377 + _mutex_lock_interruptible_nested(l, s)
7378 +# define mutex_lock_killable_nested(l, s) \
7379 + _mutex_lock_killable_nested(l, s)
7381 +# define mutex_lock_nest_lock(lock, nest_lock) \
7383 + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
7384 + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
7388 +# define mutex_lock_nested(l, s) _mutex_lock(l)
7389 +# define mutex_lock_interruptible_nested(l, s) \
7390 + _mutex_lock_interruptible(l)
7391 +# define mutex_lock_killable_nested(l, s) \
7392 + _mutex_lock_killable(l)
7393 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7396 +# define mutex_init(mutex) \
7397 +do { \
7398 + static struct lock_class_key __key; \
7400 + rt_mutex_init(&(mutex)->lock); \
7401 + __mutex_do_init((mutex), #mutex, &__key); \
7402 +} while (0)
7404 +# define __mutex_init(mutex, name, key) \
7405 +do { \
7406 + rt_mutex_init(&(mutex)->lock); \
7407 + __mutex_do_init((mutex), name, key); \
7408 +} while (0)
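
On RT every struct mutex wraps an rt_mutex, so mutex_lock() gains priority inheritance while the API stays identical, as the macro layer above shows; existing users recompile unchanged. A minimal sketch:

    static DEFINE_MUTEX(cfg_lock);        /* rt_mutex-backed under RT_FULL */

    static int cfg_update(int val, int *cfg)
    {
            int ret = mutex_lock_interruptible(&cfg_lock);

            if (ret)
                    return ret;
            *cfg = val;                   /* holder can be boosted if a
                                             higher-prio task blocks here */
            mutex_unlock(&cfg_lock);
            return 0;
    }
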
7411 diff -Nur linux-4.9.6.orig/include/linux/netdevice.h linux-4.9.6/include/linux/netdevice.h
7412 --- linux-4.9.6.orig/include/linux/netdevice.h 2017-01-26 08:25:24.000000000 +0100
7413 +++ linux-4.9.6/include/linux/netdevice.h 2017-01-28 13:59:09.939656496 +0100
7414 @@ -396,7 +396,19 @@
7415 typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7417 void __napi_schedule(struct napi_struct *n);
7420 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7421 + * run as threads, and they can also be preempted (without PREEMPT_RT
7422 + * interrupt threads can not be preempted). Which means that calling
7423 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
7424 + * and can corrupt the napi->poll_list.
7426 +#ifdef CONFIG_PREEMPT_RT_FULL
7427 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7428 +#else
7429 void __napi_schedule_irqoff(struct napi_struct *n);
7430 +#endif
7432 static inline bool napi_disable_pending(struct napi_struct *n)
7434 @@ -2461,14 +2473,53 @@
7435 void synchronize_net(void);
7436 int init_dummy_netdev(struct net_device *dev);
7438 -DECLARE_PER_CPU(int, xmit_recursion);
7439 #define XMIT_RECURSION_LIMIT 10
7440 +#ifdef CONFIG_PREEMPT_RT_FULL
7441 +static inline int dev_recursion_level(void)
7443 + return current->xmit_recursion;
7446 +static inline int xmit_rec_read(void)
7448 + return current->xmit_recursion;
7451 +static inline void xmit_rec_inc(void)
7453 + current->xmit_recursion++;
7456 +static inline void xmit_rec_dec(void)
7458 + current->xmit_recursion--;
7463 +DECLARE_PER_CPU(int, xmit_recursion);
7465 static inline int dev_recursion_level(void)
7467 return this_cpu_read(xmit_recursion);
7470 +static inline int xmit_rec_read(void)
7472 + return __this_cpu_read(xmit_recursion);
7475 +static inline void xmit_rec_inc(void)
7477 + __this_cpu_inc(xmit_recursion);
7480 +static inline void xmit_rec_dec(void)
7482 + __this_cpu_dec(xmit_recursion);
7486 struct net_device *dev_get_by_index(struct net *net, int ifindex);
7487 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7488 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7489 @@ -2851,6 +2902,7 @@
7490 unsigned int dropped;
7491 struct sk_buff_head input_pkt_queue;
7492 struct napi_struct backlog;
7493 + struct sk_buff_head tofree_queue;
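
The xmit_recursion counter above moves from per-CPU storage into task_struct because an RT transmit path can be preempted and migrated mid-send, which would corrupt a per-CPU count; the accessors hide the difference. A sketch of the guard they support, loosely mirroring the net/core/dev.c callers converted elsewhere in this patch (function name and error value are illustrative):

    static int xmit_guarded(struct sk_buff *skb, struct net_device *dev)
    {
            int ret;

            if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
                    return -ELOOP;        /* refuse to recurse any deeper */

            xmit_rec_inc();
            ret = dev_queue_xmit(skb);    /* may lawfully re-enter once */
            xmit_rec_dec();
            return ret;
    }
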
7497 diff -Nur linux-4.9.6.orig/include/linux/netfilter/x_tables.h linux-4.9.6/include/linux/netfilter/x_tables.h
7498 --- linux-4.9.6.orig/include/linux/netfilter/x_tables.h 2017-01-26 08:25:24.000000000 +0100
7499 +++ linux-4.9.6/include/linux/netfilter/x_tables.h 2017-01-28 13:59:09.939656496 +0100
7502 #include <linux/netdevice.h>
7503 #include <linux/static_key.h>
7504 +#include <linux/locallock.h>
7505 #include <uapi/linux/netfilter/x_tables.h>
7507 /* Test a struct->invflags and a boolean for inequality */
7510 DECLARE_PER_CPU(seqcount_t, xt_recseq);
7512 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7514 /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7516 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7519 unsigned int addend;
7521 + /* RT protection */
7522 + local_lock(xt_write_lock);
7525 * Low order bit of sequence is set if we already
7526 * called xt_write_recseq_begin().
7528 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7530 __this_cpu_add(xt_recseq.sequence, addend);
7531 + local_unlock(xt_write_lock);
7535 diff -Nur linux-4.9.6.orig/include/linux/nfs_fs.h linux-4.9.6/include/linux/nfs_fs.h
7536 --- linux-4.9.6.orig/include/linux/nfs_fs.h 2017-01-26 08:25:24.000000000 +0100
7537 +++ linux-4.9.6/include/linux/nfs_fs.h 2017-01-28 13:59:09.939656496 +0100
7538 @@ -165,7 +165,11 @@
7540 /* Readers: in-flight sillydelete RPC calls */
7541 /* Writers: rmdir */
7542 +#ifdef CONFIG_PREEMPT_RT_BASE
7543 + struct semaphore rmdir_sem;
7544 +#else
7545 struct rw_semaphore rmdir_sem;
7546 +#endif
7548 #if IS_ENABLED(CONFIG_NFS_V4)
7549 struct nfs4_cached_acl *nfs4_acl;
7550 diff -Nur linux-4.9.6.orig/include/linux/nfs_xdr.h linux-4.9.6/include/linux/nfs_xdr.h
7551 --- linux-4.9.6.orig/include/linux/nfs_xdr.h 2017-01-26 08:25:24.000000000 +0100
7552 +++ linux-4.9.6/include/linux/nfs_xdr.h 2017-01-28 13:59:09.939656496 +0100
7553 @@ -1490,7 +1490,7 @@
7554 struct nfs_removeargs args;
7555 struct nfs_removeres res;
7556 struct dentry *dentry;
7557 - wait_queue_head_t wq;
7558 + struct swait_queue_head wq;
7559 struct rpc_cred *cred;
7560 struct nfs_fattr dir_attr;
7562 diff -Nur linux-4.9.6.orig/include/linux/notifier.h linux-4.9.6/include/linux/notifier.h
7563 --- linux-4.9.6.orig/include/linux/notifier.h 2017-01-26 08:25:24.000000000 +0100
7564 +++ linux-4.9.6/include/linux/notifier.h 2017-01-28 13:59:09.939656496 +0100
7567 * Alan Cox <Alan.Cox@linux.org>
7571 #ifndef _LINUX_NOTIFIER_H
7572 #define _LINUX_NOTIFIER_H
7573 #include <linux/errno.h>
7575 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
7576 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
7577 * SRCU notifier chains should be used when the chain will be called very
7578 - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
7579 - * chains are slightly more difficult to use because they require special
7580 - * runtime initialization.
7581 + * often but notifier_blocks will seldom be removed.
7584 struct notifier_block;
7586 (name)->head = NULL; \
7589 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
7590 +/* srcu_notifier_heads must be cleaned up dynamically */
7591 extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
7592 #define srcu_cleanup_notifier_head(name) \
7593 cleanup_srcu_struct(&(name)->srcu);
7594 @@ -103,7 +101,13 @@
7596 #define RAW_NOTIFIER_INIT(name) { \
7598 -/* srcu_notifier_heads cannot be initialized statically */
7600 +#define SRCU_NOTIFIER_INIT(name, pcpu) \
7601 + { \
7602 + .mutex = __MUTEX_INITIALIZER(name.mutex), \
7603 + .head = NULL, \
7604 + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
7605 + }
7607 #define ATOMIC_NOTIFIER_HEAD(name) \
7608 struct atomic_notifier_head name = \
7609 @@ -115,6 +119,18 @@
7610 struct raw_notifier_head name = \
7611 RAW_NOTIFIER_INIT(name)
7613 +#define _SRCU_NOTIFIER_HEAD(name, mod) \
7614 + static DEFINE_PER_CPU(struct srcu_struct_array, \
7615 + name##_head_srcu_array); \
7616 + mod struct srcu_notifier_head name = \
7617 + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
7619 +#define SRCU_NOTIFIER_HEAD(name) \
7620 + _SRCU_NOTIFIER_HEAD(name, )
7622 +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
7623 + _SRCU_NOTIFIER_HEAD(name, static)
7627 extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
7628 @@ -184,12 +200,12 @@
7631 * Declared notifiers so far. I can imagine quite a few more chains
7632 - * over time (eg laptop power reset chains, reboot chain (to clean
7633 + * over time (eg laptop power reset chains, reboot chain (to clean
7634 * device units up), device [un]mount chain, module load/unload chain,
7635 - * low memory chain, screenblank chain (for plug in modular screenblankers)
7636 + * low memory chain, screenblank chain (for plug in modular screenblankers)
7637 * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
7641 /* CPU notifiers are defined in include/linux/cpu.h. */
7643 /* netdevice notifiers are defined in include/linux/netdevice.h */
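
The hunks above make SRCU notifier heads statically initializable via the new SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() macros, so srcu_init_notifier_head() is no longer mandatory. A minimal usage sketch (not part of the patch; my_chain, my_nb and my_handler are hypothetical names):

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);	/* per-CPU srcu array set up statically */

static int my_handler(struct notifier_block *nb, unsigned long action,
		      void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_handler,
};

static int __init my_setup(void)
{
	/* No srcu_init_notifier_head() needed for a statically defined head. */
	srcu_notifier_chain_register(&my_chain, &my_nb);
	srcu_notifier_call_chain(&my_chain, 0, NULL);
	return 0;
}
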
7644 diff -Nur linux-4.9.6.orig/include/linux/percpu.h linux-4.9.6/include/linux/percpu.h
7645 --- linux-4.9.6.orig/include/linux/percpu.h 2017-01-26 08:25:24.000000000 +0100
7646 +++ linux-4.9.6/include/linux/percpu.h 2017-01-28 13:59:09.939656496 +0100
7648 #define PERCPU_MODULE_RESERVE 0
7651 +#ifdef CONFIG_PREEMPT_RT_FULL
7653 +#define get_local_var(var) (*({ \
7654 + migrate_disable(); \
7655 + this_cpu_ptr(&var); }))
7657 +#define put_local_var(var) do { \
7658 +	(void)&(var); \
7659 +	migrate_enable(); \
7660 +} while (0)
7661 +
7662 +# define get_local_ptr(var) ({ \
7663 +	migrate_disable(); \
7664 +	this_cpu_ptr(var); })
7665 +
7666 +# define put_local_ptr(var) do { \
7667 +	(void)(var); \
7668 +	migrate_enable(); \
7669 +} while (0)
7670 +
7671 +#else
7672 +
7673 +#define get_local_var(var) get_cpu_var(var)
7674 +#define put_local_var(var) put_cpu_var(var)
7675 +#define get_local_ptr(var) get_cpu_ptr(var)
7676 +#define put_local_ptr(var) put_cpu_ptr(var)
7677 +
7678 +#endif
7680 /* minimum unit size, also is the maximum supported allocation size */
7681 #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
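
get_local_var()/put_local_var() (and the _ptr variants) provide per-CPU access that only disables migration on PREEMPT_RT_FULL, so the section stays preemptible, and that fall back to get_cpu_var()/put_cpu_var() otherwise. A sketch (not part of the patch) with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);

static void my_counter_inc(void)
{
	/* RT: migrate_disable() + this_cpu access; !RT: preempt_disable() */
	get_local_var(my_counter)++;
	put_local_var(my_counter);	/* migrate_enable() / preempt_enable() */
}
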
7683 diff -Nur linux-4.9.6.orig/include/linux/percpu-rwsem.h linux-4.9.6/include/linux/percpu-rwsem.h
7684 --- linux-4.9.6.orig/include/linux/percpu-rwsem.h 2017-01-26 08:25:24.000000000 +0100
7685 +++ linux-4.9.6/include/linux/percpu-rwsem.h 2017-01-28 13:59:09.939656496 +0100
7687 #include <linux/atomic.h>
7688 #include <linux/rwsem.h>
7689 #include <linux/percpu.h>
7690 -#include <linux/wait.h>
7691 +#include <linux/swait.h>
7692 #include <linux/rcu_sync.h>
7693 #include <linux/lockdep.h>
7696 struct rcu_sync rss;
7697 unsigned int __percpu *read_count;
7698 struct rw_semaphore rw_sem;
7699 - wait_queue_head_t writer;
7700 + struct swait_queue_head writer;
7705 .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
7706 .read_count = &__percpu_rwsem_rc_##name, \
7707 .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
7708 - .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
7709 + .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
7712 extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
7713 extern void __percpu_up_read(struct percpu_rw_semaphore *);
7715 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
7716 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7721 __this_cpu_inc(*sem->read_count);
7722 if (unlikely(!rcu_sync_is_idle(&sem->rss)))
7723 __percpu_down_read(sem, false); /* Unconditional memory barrier */
7726 - * The barrier() prevents the compiler from
7727 + * The preempt_enable() prevents the compiler from
7728 * bleeding the critical section out.
7732 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
7734 - percpu_down_read_preempt_disable(sem);
7742 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
7743 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7746 - * The barrier() prevents the compiler from
7747 - * bleeding the critical section out.
7750 + preempt_disable();
7752 * Same as in percpu_down_read().
7755 rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
7758 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
7760 - preempt_disable();
7761 - percpu_up_read_preempt_enable(sem);
7764 extern void percpu_down_write(struct percpu_rw_semaphore *);
7765 extern void percpu_up_write(struct percpu_rw_semaphore *);
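
With the _preempt_disable()/_preempt_enable() variants folded into the plain calls, the per-CPU reader fast path no longer runs with preemption disabled. Callers keep the existing API; a sketch with a hypothetical my_sem:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_sem);

static void my_reader(void)
{
	percpu_down_read(&my_sem);	/* fast path: per-CPU count, now preemptible */
	/* read-side critical section */
	percpu_up_read(&my_sem);
}
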
7767 diff -Nur linux-4.9.6.orig/include/linux/pid.h linux-4.9.6/include/linux/pid.h
7768 --- linux-4.9.6.orig/include/linux/pid.h 2017-01-26 08:25:24.000000000 +0100
7769 +++ linux-4.9.6/include/linux/pid.h 2017-01-28 13:59:09.939656496 +0100
7771 #define _LINUX_PID_H
7773 #include <linux/rcupdate.h>
7774 +#include <linux/atomic.h>
7778 diff -Nur linux-4.9.6.orig/include/linux/preempt.h linux-4.9.6/include/linux/preempt.h
7779 --- linux-4.9.6.orig/include/linux/preempt.h 2017-01-26 08:25:24.000000000 +0100
7780 +++ linux-4.9.6/include/linux/preempt.h 2017-01-28 13:59:09.939656496 +0100
7782 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
7783 #define NMI_OFFSET (1UL << NMI_SHIFT)
7785 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
7786 +#ifndef CONFIG_PREEMPT_RT_FULL
7787 +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
7788 +#else
7789 +# define SOFTIRQ_DISABLE_OFFSET (0)
7790 +#endif
7792 /* We use the MSB mostly because it's available */
7793 #define PREEMPT_NEED_RESCHED 0x80000000
7795 #include <asm/preempt.h>
7797 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
7798 -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
7799 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
7801 +#ifndef CONFIG_PREEMPT_RT_FULL
7802 +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
7803 +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
7804 +#else
7805 +# define softirq_count() (0UL)
7806 +extern int in_serving_softirq(void);
7807 +#endif
7810 * Are we doing bottom half or hardware interrupt processing?
7812 #define in_irq() (hardirq_count())
7813 #define in_softirq() (softirq_count())
7814 #define in_interrupt() (irq_count())
7815 -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
7818 * Are we in NMI context?
7821 * The preempt_count offset after spin_lock()
7823 +#if !defined(CONFIG_PREEMPT_RT_FULL)
7824 #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
7825 +#else
7826 +#define PREEMPT_LOCK_OFFSET 0
7827 +#endif
7830 * The preempt_count offset needed for things like:
7831 @@ -140,6 +153,20 @@
7832 #define preempt_count_inc() preempt_count_add(1)
7833 #define preempt_count_dec() preempt_count_sub(1)
7835 +#ifdef CONFIG_PREEMPT_LAZY
7836 +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
7837 +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
7838 +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
7839 +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
7840 +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
7841 +#else
7842 +#define add_preempt_lazy_count(val) do { } while (0)
7843 +#define sub_preempt_lazy_count(val) do { } while (0)
7844 +#define inc_preempt_lazy_count() do { } while (0)
7845 +#define dec_preempt_lazy_count() do { } while (0)
7846 +#define preempt_lazy_count() (0)
7847 +#endif
7849 #ifdef CONFIG_PREEMPT_COUNT
7851 #define preempt_disable() \
7852 @@ -148,13 +175,25 @@
7856 +#define preempt_lazy_disable() \
7858 + inc_preempt_lazy_count(); \
7862 #define sched_preempt_enable_no_resched() \
7865 preempt_count_dec(); \
7868 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
7869 +#ifdef CONFIG_PREEMPT_RT_BASE
7870 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
7871 +# define preempt_check_resched_rt() preempt_check_resched()
7873 +# define preempt_enable_no_resched() preempt_enable()
7874 +# define preempt_check_resched_rt() barrier();
7877 #define preemptible() (preempt_count() == 0 && !irqs_disabled())
7879 @@ -179,6 +218,13 @@
7880 __preempt_schedule(); \
7883 +#define preempt_lazy_enable() \
7885 + dec_preempt_lazy_count(); \
7887 + preempt_check_resched(); \
7890 #else /* !CONFIG_PREEMPT */
7891 #define preempt_enable() \
7894 #define preempt_disable_notrace() barrier()
7895 #define preempt_enable_no_resched_notrace() barrier()
7896 #define preempt_enable_notrace() barrier()
7897 +#define preempt_check_resched_rt() barrier()
7898 #define preemptible() 0
7900 #endif /* CONFIG_PREEMPT_COUNT */
7901 @@ -244,10 +291,31 @@
7903 #define preempt_fold_need_resched() \
7905 - if (tif_need_resched()) \
7906 + if (tif_need_resched_now()) \
7907 set_preempt_need_resched(); \
7910 +#ifdef CONFIG_PREEMPT_RT_FULL
7911 +# define preempt_disable_rt() preempt_disable()
7912 +# define preempt_enable_rt() preempt_enable()
7913 +# define preempt_disable_nort() barrier()
7914 +# define preempt_enable_nort() barrier()
7916 + extern void migrate_disable(void);
7917 + extern void migrate_enable(void);
7918 +# else /* CONFIG_SMP */
7919 +# define migrate_disable() barrier()
7920 +# define migrate_enable() barrier()
7921 +# endif /* CONFIG_SMP */
7923 +# define preempt_disable_rt() barrier()
7924 +# define preempt_enable_rt() barrier()
7925 +# define preempt_disable_nort() preempt_disable()
7926 +# define preempt_enable_nort() preempt_enable()
7927 +# define migrate_disable() preempt_disable()
7928 +# define migrate_enable() preempt_enable()
7931 #ifdef CONFIG_PREEMPT_NOTIFIERS
7933 struct preempt_notifier;
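
migrate_disable()/migrate_enable() are the central primitives added here: on RT with SMP they pin the task to its current CPU without disabling preemption, and on !RT they collapse to preempt_disable()/preempt_enable(). A sketch of the intended pattern (my_percpu_work is hypothetical):

#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_percpu_work(void)
{
	migrate_disable();	/* task may be preempted, but not migrated */
	/* smp_processor_id() is stable for the whole section */
	pr_debug("running on CPU %d\n", smp_processor_id());
	migrate_enable();
}
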
7934 diff -Nur linux-4.9.6.orig/include/linux/printk.h linux-4.9.6/include/linux/printk.h
7935 --- linux-4.9.6.orig/include/linux/printk.h 2017-01-26 08:25:24.000000000 +0100
7936 +++ linux-4.9.6/include/linux/printk.h 2017-01-28 13:59:09.939656496 +0100
7937 @@ -126,9 +126,11 @@
7938 #ifdef CONFIG_EARLY_PRINTK
7939 extern asmlinkage __printf(1, 2)
7940 void early_printk(const char *fmt, ...);
7941 +extern void printk_kill(void);
7943 static inline __printf(1, 2) __cold
7944 void early_printk(const char *s, ...) { }
7945 +static inline void printk_kill(void) { }
7948 #ifdef CONFIG_PRINTK_NMI
7949 diff -Nur linux-4.9.6.orig/include/linux/radix-tree.h linux-4.9.6/include/linux/radix-tree.h
7950 --- linux-4.9.6.orig/include/linux/radix-tree.h 2017-01-26 08:25:24.000000000 +0100
7951 +++ linux-4.9.6/include/linux/radix-tree.h 2017-01-28 13:59:09.939656496 +0100
7952 @@ -289,9 +289,19 @@
7953 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
7954 void ***results, unsigned long *indices,
7955 unsigned long first_index, unsigned int max_items);
7956 +#ifdef CONFIG_PREEMPT_RT_FULL
7957 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
7958 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
7959 +static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
7965 int radix_tree_preload(gfp_t gfp_mask);
7966 int radix_tree_maybe_preload(gfp_t gfp_mask);
7967 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
7969 void radix_tree_init(void);
7970 void *radix_tree_tag_set(struct radix_tree_root *root,
7971 unsigned long index, unsigned int tag);
7974 static inline void radix_tree_preload_end(void)
7975 {
7976 -	preempt_enable();
7977 +	preempt_enable_nort();
7978 }
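
The RT stubs exist because the preload scheme relies on disabled preemption, which RT's sleeping spinlocks cannot guarantee; the canonical preload-then-insert pattern still compiles and behaves correctly on both configurations. A sketch, with my_tree/my_tree_lock as hypothetical names:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(my_tree, GFP_KERNEL);
static DEFINE_SPINLOCK(my_tree_lock);

static int my_insert(unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* no-op returning 0 on RT */
	if (err)
		return err;

	spin_lock(&my_tree_lock);
	err = radix_tree_insert(&my_tree, index, item);
	spin_unlock(&my_tree_lock);

	radix_tree_preload_end();		/* preempt_enable_nort() */
	return err;
}
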
7981 diff -Nur linux-4.9.6.orig/include/linux/random.h linux-4.9.6/include/linux/random.h
7982 --- linux-4.9.6.orig/include/linux/random.h 2017-01-26 08:25:24.000000000 +0100
7983 +++ linux-4.9.6/include/linux/random.h 2017-01-28 13:59:09.943656649 +0100
7986 extern void add_input_randomness(unsigned int type, unsigned int code,
7987 unsigned int value) __latent_entropy;
7988 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
7989 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
7991 extern void get_random_bytes(void *buf, int nbytes);
7992 extern int add_random_ready_callback(struct random_ready_callback *rdy);
7993 diff -Nur linux-4.9.6.orig/include/linux/rbtree_augmented.h linux-4.9.6/include/linux/rbtree_augmented.h
7994 --- linux-4.9.6.orig/include/linux/rbtree_augmented.h 2017-01-26 08:25:24.000000000 +0100
7995 +++ linux-4.9.6/include/linux/rbtree_augmented.h 2017-01-28 13:59:09.943656649 +0100
7998 #include <linux/compiler.h>
7999 #include <linux/rbtree.h>
8000 +#include <linux/rcupdate.h>
8003 * Please note - only struct rb_augment_callbacks and the prototypes for
8004 diff -Nur linux-4.9.6.orig/include/linux/rbtree.h linux-4.9.6/include/linux/rbtree.h
8005 --- linux-4.9.6.orig/include/linux/rbtree.h 2017-01-26 08:25:24.000000000 +0100
8006 +++ linux-4.9.6/include/linux/rbtree.h 2017-01-28 13:59:09.943656649 +0100
8009 #include <linux/kernel.h>
8010 #include <linux/stddef.h>
8011 -#include <linux/rcupdate.h>
8012 +#include <linux/rcu_assign_pointer.h>
8015 unsigned long __rb_parent_color;
8016 diff -Nur linux-4.9.6.orig/include/linux/rcu_assign_pointer.h linux-4.9.6/include/linux/rcu_assign_pointer.h
8017 --- linux-4.9.6.orig/include/linux/rcu_assign_pointer.h 1970-01-01 01:00:00.000000000 +0100
8018 +++ linux-4.9.6/include/linux/rcu_assign_pointer.h 2017-01-28 13:59:09.943656649 +0100
8020 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8021 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8022 +#include <linux/compiler.h>
8023 +#include <asm/barrier.h>
8026 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8027 + * @v: The value to statically initialize with.
8029 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8032 + * rcu_assign_pointer() - assign to RCU-protected pointer
8033 + * @p: pointer to assign to
8034 + * @v: value to assign (publish)
8036 + * Assigns the specified value to the specified RCU-protected
8037 + * pointer, ensuring that any concurrent RCU readers will see
8038 + * any prior initialization.
8040 + * Inserts memory barriers on architectures that require them
8041 + * (which is most of them), and also prevents the compiler from
8042 + * reordering the code that initializes the structure after the pointer
8043 + * assignment. More importantly, this call documents which pointers
8044 + * will be dereferenced by RCU read-side code.
8046 + * In some special cases, you may use RCU_INIT_POINTER() instead
8047 + * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
8048 + * to the fact that it does not constrain either the CPU or the compiler.
8049 + * That said, using RCU_INIT_POINTER() when you should have used
8050 + * rcu_assign_pointer() is a very bad thing that results in
8051 + * impossible-to-diagnose memory corruption. So please be careful.
8052 + * See the RCU_INIT_POINTER() comment header for details.
8054 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8055 + * once, appearances notwithstanding. One of the "extra" evaluations
8056 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8057 + * neither of which actually execute the argument. As with most cpp
8058 + * macros, this execute-arguments-only-once property is important, so
8059 + * please be careful when making changes to rcu_assign_pointer() and the
8060 + * other macros that it invokes.
8062 +#define rcu_assign_pointer(p, v) \
8064 + uintptr_t _r_a_p__v = (uintptr_t)(v); \
8066 + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
8067 + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
8068 +	else \
8069 +		smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8070 +	_r_a_p__v; \
8071 +})
8072 +
8073 +#endif
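
The new header only relocates RCU_INITIALIZER()/rcu_assign_pointer() so rbtree.h can use them without pulling in all of rcupdate.h; the publish semantics are unchanged. A standard usage sketch (struct my_data is hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
	int val;
};

static struct my_data __rcu *my_global;

static int my_publish(int val)
{
	struct my_data *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	p->val = val;
	/* smp_store_release() orders the initialization before the pointer
	 * becomes visible to rcu_dereference() readers. */
	rcu_assign_pointer(my_global, p);
	return 0;
}
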
8074 diff -Nur linux-4.9.6.orig/include/linux/rcupdate.h linux-4.9.6/include/linux/rcupdate.h
8075 --- linux-4.9.6.orig/include/linux/rcupdate.h 2017-01-26 08:25:24.000000000 +0100
8076 +++ linux-4.9.6/include/linux/rcupdate.h 2017-01-28 13:59:09.943656649 +0100
8078 #include <linux/compiler.h>
8079 #include <linux/ktime.h>
8080 #include <linux/irqflags.h>
8081 +#include <linux/rcu_assign_pointer.h>
8083 #include <asm/barrier.h>
8087 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8089 +#ifdef CONFIG_PREEMPT_RT_FULL
8090 +#define call_rcu_bh call_rcu
8091 +#else
8092 /**
8093  * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8094 * @head: structure to be used for queueing the RCU updates.
8097 void call_rcu_bh(struct rcu_head *head,
8098 		rcu_callback_t func);
8099 +#endif
8102 * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8103 @@ -301,6 +306,11 @@
8104 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8106 #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8107 +#ifndef CONFIG_PREEMPT_RT_FULL
8108 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
8109 +#else
8110 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8111 +#endif
8113 #else /* #ifdef CONFIG_PREEMPT_RCU */
8119 +#define sched_rcu_preempt_depth() rcu_preempt_depth()
8121 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8123 /* Internal to kernel */
8124 @@ -505,7 +517,14 @@
8125 int debug_lockdep_rcu_enabled(void);
8127 int rcu_read_lock_held(void);
8128 +#ifdef CONFIG_PREEMPT_RT_FULL
8129 +static inline int rcu_read_lock_bh_held(void)
8130 +{
8131 +	return rcu_read_lock_held();
8132 +}
8133 +#else
8134 int rcu_read_lock_bh_held(void);
8135 +#endif
8138 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8139 @@ -626,54 +645,6 @@
8143 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8144 - * @v: The value to statically initialize with.
8146 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8149 - * rcu_assign_pointer() - assign to RCU-protected pointer
8150 - * @p: pointer to assign to
8151 - * @v: value to assign (publish)
8153 - * Assigns the specified value to the specified RCU-protected
8154 - * pointer, ensuring that any concurrent RCU readers will see
8155 - * any prior initialization.
8157 - * Inserts memory barriers on architectures that require them
8158 - * (which is most of them), and also prevents the compiler from
8159 - * reordering the code that initializes the structure after the pointer
8160 - * assignment. More importantly, this call documents which pointers
8161 - * will be dereferenced by RCU read-side code.
8163 - * In some special cases, you may use RCU_INIT_POINTER() instead
8164 - * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
8165 - * to the fact that it does not constrain either the CPU or the compiler.
8166 - * That said, using RCU_INIT_POINTER() when you should have used
8167 - * rcu_assign_pointer() is a very bad thing that results in
8168 - * impossible-to-diagnose memory corruption. So please be careful.
8169 - * See the RCU_INIT_POINTER() comment header for details.
8171 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8172 - * once, appearances notwithstanding. One of the "extra" evaluations
8173 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8174 - * neither of which actually execute the argument. As with most cpp
8175 - * macros, this execute-arguments-only-once property is important, so
8176 - * please be careful when making changes to rcu_assign_pointer() and the
8177 - * other macros that it invokes.
8179 -#define rcu_assign_pointer(p, v) \
8181 - uintptr_t _r_a_p__v = (uintptr_t)(v); \
8183 - if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
8184 - WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
8185 -	else \
8186 -		smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8187 -	_r_a_p__v; \
8188 -})
8189 -
8190 /**
8191 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8192 * @p: The pointer to read
8194 @@ -951,10 +922,14 @@
8195 static inline void rcu_read_lock_bh(void)
8198 +#ifdef CONFIG_PREEMPT_RT_FULL
8199 +	rcu_read_lock();
8200 +#else
8201 	__acquire(RCU_BH);
8202 	rcu_lock_acquire(&rcu_bh_lock_map);
8203 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
8204 			 "rcu_read_lock_bh() used illegally while idle");
8205 +#endif
8209 @@ -964,10 +939,14 @@
8211 static inline void rcu_read_unlock_bh(void)
8213 +#ifdef CONFIG_PREEMPT_RT_FULL
8214 +	rcu_read_unlock();
8215 +#else
8216 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
8217 			 "rcu_read_unlock_bh() used illegally while idle");
8218 	rcu_lock_release(&rcu_bh_lock_map);
8219 	__release(RCU_BH);
8220 	local_bh_enable();
8221 +#endif
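
On RT there is no separate bh RCU flavor, so the _bh read side is layered on plain RCU and existing callers need no change. A brief sketch:

#include <linux/rcupdate.h>

static void my_bh_reader(void)
{
	rcu_read_lock_bh();	/* nests rcu_read_lock() on PREEMPT_RT_FULL */
	/* ... rcu_dereference_bh()-protected accesses ... */
	rcu_read_unlock_bh();
}
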
8224 diff -Nur linux-4.9.6.orig/include/linux/rcutree.h linux-4.9.6/include/linux/rcutree.h
8225 --- linux-4.9.6.orig/include/linux/rcutree.h 2017-01-26 08:25:24.000000000 +0100
8226 +++ linux-4.9.6/include/linux/rcutree.h 2017-01-28 13:59:09.943656649 +0100
8228 rcu_note_context_switch();
8231 +#ifdef CONFIG_PREEMPT_RT_FULL
8232 +# define synchronize_rcu_bh synchronize_rcu
8233 +#else
8234 void synchronize_rcu_bh(void);
8235 +#endif
8236 void synchronize_sched_expedited(void);
8237 void synchronize_rcu_expedited(void);
8242 void rcu_barrier(void);
8243 +#ifdef CONFIG_PREEMPT_RT_FULL
8244 +# define rcu_barrier_bh rcu_barrier
8245 +#else
8246 void rcu_barrier_bh(void);
8247 +#endif
8248 void rcu_barrier_sched(void);
8249 unsigned long get_state_synchronize_rcu(void);
8250 void cond_synchronize_rcu(unsigned long oldstate);
8252 extern unsigned long rcutorture_testseq;
8253 extern unsigned long rcutorture_vernum;
8254 unsigned long rcu_batches_started(void);
8255 -unsigned long rcu_batches_started_bh(void);
8256 unsigned long rcu_batches_started_sched(void);
8257 unsigned long rcu_batches_completed(void);
8258 -unsigned long rcu_batches_completed_bh(void);
8259 unsigned long rcu_batches_completed_sched(void);
8260 unsigned long rcu_exp_batches_completed(void);
8261 unsigned long rcu_exp_batches_completed_sched(void);
8262 void show_rcu_gp_kthreads(void);
8264 void rcu_force_quiescent_state(void);
8265 -void rcu_bh_force_quiescent_state(void);
8266 void rcu_sched_force_quiescent_state(void);
8268 void rcu_idle_enter(void);
8269 @@ -109,6 +114,16 @@
8271 bool rcu_is_watching(void);
8273 +#ifndef CONFIG_PREEMPT_RT_FULL
8274 +void rcu_bh_force_quiescent_state(void);
8275 +unsigned long rcu_batches_started_bh(void);
8276 +unsigned long rcu_batches_completed_bh(void);
8277 +#else
8278 +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
8279 +# define rcu_batches_completed_bh rcu_batches_completed
8280 +# define rcu_batches_started_bh rcu_batches_completed
8281 +#endif
8283 void rcu_all_qs(void);
8285 /* RCUtree hotplug events */
8286 diff -Nur linux-4.9.6.orig/include/linux/rtmutex.h linux-4.9.6/include/linux/rtmutex.h
8287 --- linux-4.9.6.orig/include/linux/rtmutex.h 2017-01-26 08:25:24.000000000 +0100
8288 +++ linux-4.9.6/include/linux/rtmutex.h 2017-01-28 13:59:09.943656649 +0100
8290 #define __LINUX_RT_MUTEX_H
8292 #include <linux/linkage.h>
8293 +#include <linux/spinlock_types_raw.h>
8294 #include <linux/rbtree.h>
8295 -#include <linux/spinlock_types.h>
8297 extern int max_lock_depth; /* for sysctl */
8299 +#ifdef CONFIG_DEBUG_MUTEXES
8300 +#include <linux/debug_locks.h>
8304 * The rt_mutex structure
8307 struct rb_root waiters;
8308 struct rb_node *waiters_leftmost;
8309 struct task_struct *owner;
8310 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8312 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8313 const char *name, *file;
8317 # define rt_mutex_debug_check_no_locks_held(task) do { } while (0)
8320 +# define rt_mutex_init(mutex) \
8322 + raw_spin_lock_init(&(mutex)->wait_lock); \
8323 + __rt_mutex_init(mutex, #mutex); \
8326 #ifdef CONFIG_DEBUG_RT_MUTEXES
8327 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8328 , .name = #mutexname, .file = __FILE__, .line = __LINE__
8329 -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__)
8330 extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8332 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8333 -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
8334 # define rt_mutex_debug_task_free(t) do { } while (0)
8337 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8338 - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8339 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8340 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8341 , .waiters = RB_ROOT \
8343 - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8344 + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8346 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8347 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8349 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8350 + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8351 + , .save_state = 1 }
8353 #define DEFINE_RT_MUTEX(mutexname) \
8354 struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8357 extern void rt_mutex_lock(struct rt_mutex *lock);
8358 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8359 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8360 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8361 struct hrtimer_sleeper *timeout);
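
Besides reworking the initializers, the patch adds a killable blocking variant. A sketch with a hypothetical rt-mutex (rt_mutex_unlock() is the pre-existing release path):

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_rtmutex);

static int my_locked_work(void)
{
	int ret;

	ret = rt_mutex_lock_killable(&my_rtmutex);	/* new in this patch */
	if (ret)
		return ret;	/* fatal signal arrived while blocking */
	/* critical section; waiters can priority-boost the owner */
	rt_mutex_unlock(&my_rtmutex);
	return 0;
}
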
8363 diff -Nur linux-4.9.6.orig/include/linux/rwlock_rt.h linux-4.9.6/include/linux/rwlock_rt.h
8364 --- linux-4.9.6.orig/include/linux/rwlock_rt.h 1970-01-01 01:00:00.000000000 +0100
8365 +++ linux-4.9.6/include/linux/rwlock_rt.h 2017-01-28 13:59:09.943656649 +0100
8367 +#ifndef __LINUX_RWLOCK_RT_H
8368 +#define __LINUX_RWLOCK_RT_H
8370 +#ifndef __LINUX_SPINLOCK_H
8371 +#error Do not include directly. Use spinlock.h
8374 +#define rwlock_init(rwl) \
8376 + static struct lock_class_key __key; \
8378 + rt_mutex_init(&(rwl)->lock); \
8379 + __rt_rwlock_init(rwl, #rwl, &__key); \
8382 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8383 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8384 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8385 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8386 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8387 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8388 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8389 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8390 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8391 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8393 +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
8394 +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
8396 +#define write_trylock_irqsave(lock, flags) \
8397 + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8399 +#define read_lock_irqsave(lock, flags) \
8401 + typecheck(unsigned long, flags); \
8402 + flags = rt_read_lock_irqsave(lock); \
8405 +#define write_lock_irqsave(lock, flags) \
8407 + typecheck(unsigned long, flags); \
8408 + flags = rt_write_lock_irqsave(lock); \
8411 +#define read_lock(lock) rt_read_lock(lock)
8413 +#define read_lock_bh(lock) \
8415 + local_bh_disable(); \
8416 + rt_read_lock(lock); \
8419 +#define read_lock_irq(lock) read_lock(lock)
8421 +#define write_lock(lock) rt_write_lock(lock)
8423 +#define write_lock_bh(lock) \
8425 + local_bh_disable(); \
8426 + rt_write_lock(lock); \
8429 +#define write_lock_irq(lock) write_lock(lock)
8431 +#define read_unlock(lock) rt_read_unlock(lock)
8433 +#define read_unlock_bh(lock) \
8435 + rt_read_unlock(lock); \
8436 + local_bh_enable(); \
8439 +#define read_unlock_irq(lock) read_unlock(lock)
8441 +#define write_unlock(lock) rt_write_unlock(lock)
8443 +#define write_unlock_bh(lock) \
8445 + rt_write_unlock(lock); \
8446 + local_bh_enable(); \
8449 +#define write_unlock_irq(lock) write_unlock(lock)
8451 +#define read_unlock_irqrestore(lock, flags) \
8453 + typecheck(unsigned long, flags); \
8455 + rt_read_unlock(lock); \
8458 +#define write_unlock_irqrestore(lock, flags) \
8460 + typecheck(unsigned long, flags); \
8462 +		rt_write_unlock(lock); \
8463 +	} while (0)
8464 +
8465 +#endif
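
The rwlock API is preserved verbatim; only the substance changes, since every rwlock_t now wraps an rt_mutex and read-side recursion is limited to the owning task. Callers look exactly as before (my_rwlock is hypothetical):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);

static void my_rw_reader(void)
{
	read_lock(&my_rwlock);	/* rt_read_lock(): sleeping and PI-aware on RT */
	/* ... */
	read_unlock(&my_rwlock);
}
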
8466 diff -Nur linux-4.9.6.orig/include/linux/rwlock_types.h linux-4.9.6/include/linux/rwlock_types.h
8467 --- linux-4.9.6.orig/include/linux/rwlock_types.h 2017-01-26 08:25:24.000000000 +0100
8468 +++ linux-4.9.6/include/linux/rwlock_types.h 2017-01-28 13:59:09.943656649 +0100
8470 #ifndef __LINUX_RWLOCK_TYPES_H
8471 #define __LINUX_RWLOCK_TYPES_H
8473 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8474 +# error "Do not include directly, include spinlock_types.h"
8478 * include/linux/rwlock_types.h - generic rwlock type definitions
8480 diff -Nur linux-4.9.6.orig/include/linux/rwlock_types_rt.h linux-4.9.6/include/linux/rwlock_types_rt.h
8481 --- linux-4.9.6.orig/include/linux/rwlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
8482 +++ linux-4.9.6/include/linux/rwlock_types_rt.h 2017-01-28 13:59:09.943656649 +0100
8484 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8485 +#define __LINUX_RWLOCK_TYPES_RT_H
8487 +#ifndef __LINUX_SPINLOCK_TYPES_H
8488 +#error "Do not include directly. Include spinlock_types.h instead"
8492 + * rwlocks - rtmutex which allows single reader recursion
8495 + struct rt_mutex lock;
8497 + unsigned int break_lock;
8498 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8499 + struct lockdep_map dep_map;
8503 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8504 +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
8506 +# define RW_DEP_MAP_INIT(lockname)
8509 +#define __RW_LOCK_UNLOCKED(name) \
8510 + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8511 + RW_DEP_MAP_INIT(name) }
8513 +#define DEFINE_RWLOCK(name) \
8514 + rwlock_t name = __RW_LOCK_UNLOCKED(name)
8517 diff -Nur linux-4.9.6.orig/include/linux/rwsem.h linux-4.9.6/include/linux/rwsem.h
8518 --- linux-4.9.6.orig/include/linux/rwsem.h 2017-01-26 08:25:24.000000000 +0100
8519 +++ linux-4.9.6/include/linux/rwsem.h 2017-01-28 13:59:09.943656649 +0100
8521 #include <linux/osq_lock.h>
8524 +#ifdef CONFIG_PREEMPT_RT_FULL
8525 +#include <linux/rwsem_rt.h>
8526 +#else /* PREEMPT_RT_FULL */
8528 struct rw_semaphore;
8530 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
8532 # define up_read_non_owner(sem) up_read(sem)
8535 +#endif /* !PREEMPT_RT_FULL */
8537 #endif /* _LINUX_RWSEM_H */
8538 diff -Nur linux-4.9.6.orig/include/linux/rwsem_rt.h linux-4.9.6/include/linux/rwsem_rt.h
8539 --- linux-4.9.6.orig/include/linux/rwsem_rt.h 1970-01-01 01:00:00.000000000 +0100
8540 +++ linux-4.9.6/include/linux/rwsem_rt.h 2017-01-28 13:59:09.943656649 +0100
8542 +#ifndef _LINUX_RWSEM_RT_H
8543 +#define _LINUX_RWSEM_RT_H
8545 +#ifndef _LINUX_RWSEM_H
8546 +#error "Include rwsem.h"
8550 + * RW-semaphores are a spinlock plus a reader-depth count.
8552 + * Note that the semantics are different from the usual
8553 + * Linux rw-sems: in PREEMPT_RT mode we do not allow
8554 + * multiple readers to hold the lock at once; we only allow
8555 + * a read-lock owner to read-lock recursively. This is
8556 + * better for latency, makes the implementation inherently
8557 + * fair and makes it simpler as well.
8560 +#include <linux/rtmutex.h>
8562 +struct rw_semaphore {
8563 + struct rt_mutex lock;
8565 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8566 + struct lockdep_map dep_map;
8570 +#define __RWSEM_INITIALIZER(name) \
8571 + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
8572 + RW_DEP_MAP_INIT(name) }
8574 +#define DECLARE_RWSEM(lockname) \
8575 + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
8577 +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
8578 + struct lock_class_key *key);
8580 +#define __rt_init_rwsem(sem, name, key) \
8582 + rt_mutex_init(&(sem)->lock); \
8583 + __rt_rwsem_init((sem), (name), (key));\
8586 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
8588 +# define rt_init_rwsem(sem) \
8590 + static struct lock_class_key __key; \
8592 + __rt_init_rwsem((sem), #sem, &__key); \
8595 +extern void rt_down_write(struct rw_semaphore *rwsem);
8596 +extern int rt_down_write_killable(struct rw_semaphore *rwsem);
8597 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
8598 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
8599 +extern int rt_down_write_killable_nested(struct rw_semaphore *rwsem,
8601 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
8602 + struct lockdep_map *nest);
8603 +extern void rt__down_read(struct rw_semaphore *rwsem);
8604 +extern void rt_down_read(struct rw_semaphore *rwsem);
8605 +extern int rt_down_write_trylock(struct rw_semaphore *rwsem);
8606 +extern int rt__down_read_trylock(struct rw_semaphore *rwsem);
8607 +extern int rt_down_read_trylock(struct rw_semaphore *rwsem);
8608 +extern void __rt_up_read(struct rw_semaphore *rwsem);
8609 +extern void rt_up_read(struct rw_semaphore *rwsem);
8610 +extern void rt_up_write(struct rw_semaphore *rwsem);
8611 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
8613 +#define init_rwsem(sem) rt_init_rwsem(sem)
8614 +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
8616 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
8618 + /* rt_mutex_has_waiters() */
8619 + return !RB_EMPTY_ROOT(&sem->lock.waiters);
8622 +static inline void __down_read(struct rw_semaphore *sem)
8624 + rt__down_read(sem);
8627 +static inline void down_read(struct rw_semaphore *sem)
8629 + rt_down_read(sem);
8632 +static inline int __down_read_trylock(struct rw_semaphore *sem)
8634 + return rt__down_read_trylock(sem);
8637 +static inline int down_read_trylock(struct rw_semaphore *sem)
8639 + return rt_down_read_trylock(sem);
8642 +static inline void down_write(struct rw_semaphore *sem)
8644 + rt_down_write(sem);
8647 +static inline int down_write_killable(struct rw_semaphore *sem)
8649 + return rt_down_write_killable(sem);
8652 +static inline int down_write_trylock(struct rw_semaphore *sem)
8654 + return rt_down_write_trylock(sem);
8657 +static inline void __up_read(struct rw_semaphore *sem)
8659 + __rt_up_read(sem);
8662 +static inline void up_read(struct rw_semaphore *sem)
8667 +static inline void up_write(struct rw_semaphore *sem)
8672 +static inline void downgrade_write(struct rw_semaphore *sem)
8674 + rt_downgrade_write(sem);
8677 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
8679 + return rt_down_read_nested(sem, subclass);
8682 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
8684 + rt_down_write_nested(sem, subclass);
8687 +static inline int down_write_killable_nested(struct rw_semaphore *sem,
8690 + return rt_down_write_killable_nested(sem, subclass);
8693 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8694 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8695 + struct rw_semaphore *nest_lock)
8697 + rt_down_write_nested_lock(sem, &nest_lock->dep_map);
8702 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
8703 + struct rw_semaphore *nest_lock)
8705 +	rt_down_write_nested_lock(sem, NULL);
8706 +}
8707 +#endif
8708 +#endif
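
As the header comment explains, RT rwsems admit only a single reader task at a time (with owner recursion), so the familiar API keeps working but loses reader parallelism. Sketch:

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_rwsem);

static void my_rwsem_reader(void)
{
	down_read(&my_rwsem);	/* rt_down_read(); only the owner may recurse */
	/* read side; other readers block until up_read() */
	up_read(&my_rwsem);
}
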
8709 diff -Nur linux-4.9.6.orig/include/linux/sched.h linux-4.9.6/include/linux/sched.h
8710 --- linux-4.9.6.orig/include/linux/sched.h 2017-01-26 08:25:24.000000000 +0100
8711 +++ linux-4.9.6/include/linux/sched.h 2017-01-28 13:59:09.943656649 +0100
8713 #include <linux/nodemask.h>
8714 #include <linux/mm_types.h>
8715 #include <linux/preempt.h>
8716 +#include <asm/kmap_types.h>
8718 #include <asm/page.h>
8719 #include <asm/ptrace.h>
8720 @@ -243,10 +244,7 @@
8721 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
8722 __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
8724 -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
8725 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
8726 -#define task_is_stopped_or_traced(task) \
8727 - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
8728 #define task_contributes_to_load(task) \
8729 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
8730 (task->flags & PF_FROZEN) == 0 && \
8731 @@ -312,6 +310,11 @@
8735 +#define __set_current_state_no_track(state_value) \
8736 + do { current->state = (state_value); } while (0)
8737 +#define set_current_state_no_track(state_value) \
8738 + set_mb(current->state, (state_value))
8740 /* Task command name length */
8741 #define TASK_COMM_LEN 16
8743 @@ -1013,8 +1016,18 @@
8744 struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
8746 extern void wake_q_add(struct wake_q_head *head,
8747 - struct task_struct *task);
8748 -extern void wake_up_q(struct wake_q_head *head);
8749 + struct task_struct *task);
8750 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
8752 +static inline void wake_up_q(struct wake_q_head *head)
8754 + __wake_up_q(head, false);
8757 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
8759 + __wake_up_q(head, true);
8763 * sched-domains (multiprocessor balancing) declarations:
8764 @@ -1481,6 +1494,7 @@
8765 struct thread_info thread_info;
8767 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
8768 + volatile long saved_state; /* saved state for "spinlock sleepers" */
8771 unsigned int flags; /* per process flags, defined below */
8772 @@ -1520,6 +1534,12 @@
8775 unsigned int policy;
8776 +#ifdef CONFIG_PREEMPT_RT_FULL
8777 + int migrate_disable;
8778 +# ifdef CONFIG_SCHED_DEBUG
8779 + int migrate_disable_atomic;
8782 int nr_cpus_allowed;
8783 cpumask_t cpus_allowed;
8785 @@ -1654,6 +1674,9 @@
8787 struct task_cputime cputime_expires;
8788 struct list_head cpu_timers[3];
8789 +#ifdef CONFIG_PREEMPT_RT_BASE
8790 + struct task_struct *posix_timer_list;
8793 /* process credentials */
8794 const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
8795 @@ -1685,10 +1708,15 @@
8796 /* signal handlers */
8797 struct signal_struct *signal;
8798 struct sighand_struct *sighand;
8799 + struct sigqueue *sigqueue_cache;
8801 sigset_t blocked, real_blocked;
8802 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
8803 struct sigpending pending;
8804 +#ifdef CONFIG_PREEMPT_RT_FULL
8805 + /* TODO: move me into ->restart_block ? */
8806 + struct siginfo forced_info;
8809 unsigned long sas_ss_sp;
8811 @@ -1917,6 +1945,12 @@
8812 /* bitmask and counter of trace recursion */
8813 unsigned long trace_recursion;
8814 #endif /* CONFIG_TRACING */
8815 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
8816 + u64 preempt_timestamp_hist;
8817 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
8818 + long timer_offset;
8822 /* Coverage collection mode enabled for this task (0 if disabled). */
8823 enum kcov_mode kcov_mode;
8824 @@ -1942,9 +1976,23 @@
8825 unsigned int sequential_io;
8826 unsigned int sequential_io_avg;
8828 +#ifdef CONFIG_PREEMPT_RT_BASE
8829 + struct rcu_head put_rcu;
8830 + int softirq_nestcnt;
8831 + unsigned int softirqs_raised;
8833 +#ifdef CONFIG_PREEMPT_RT_FULL
8834 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
8836 + pte_t kmap_pte[KM_TYPE_NR];
8839 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8840 unsigned long task_state_change;
8842 +#ifdef CONFIG_PREEMPT_RT_FULL
8843 + int xmit_recursion;
8845 int pagefault_disabled;
8847 struct task_struct *oom_reaper_list;
8848 @@ -1984,14 +2032,6 @@
8852 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
8853 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
8855 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
8857 - return p->nr_cpus_allowed;
8860 #define TNF_MIGRATED 0x01
8861 #define TNF_NO_GROUP 0x02
8862 #define TNF_SHARED 0x04
8863 @@ -2207,6 +2247,15 @@
8864 extern void free_task(struct task_struct *tsk);
8865 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
8867 +#ifdef CONFIG_PREEMPT_RT_BASE
8868 +extern void __put_task_struct_cb(struct rcu_head *rhp);
8870 +static inline void put_task_struct(struct task_struct *t)
8872 + if (atomic_dec_and_test(&t->usage))
8873 + call_rcu(&t->put_rcu, __put_task_struct_cb);
8876 extern void __put_task_struct(struct task_struct *t);
8878 static inline void put_task_struct(struct task_struct *t)
8879 @@ -2214,6 +2263,7 @@
8880 if (atomic_dec_and_test(&t->usage))
8881 __put_task_struct(t);
8885 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
8886 struct task_struct *try_get_task_struct(struct task_struct **ptask);
8887 @@ -2255,6 +2305,7 @@
8891 +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
8892 #define PF_EXITING 0x00000004 /* getting shut down */
8893 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
8894 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
8895 @@ -2423,6 +2474,10 @@
8897 extern int set_cpus_allowed_ptr(struct task_struct *p,
8898 const struct cpumask *new_mask);
8899 +int migrate_me(void);
8900 +void tell_sched_cpu_down_begin(int cpu);
8901 +void tell_sched_cpu_down_done(int cpu);
8904 static inline void do_set_cpus_allowed(struct task_struct *p,
8905 const struct cpumask *new_mask)
8906 @@ -2435,6 +2490,9 @@
8910 +static inline int migrate_me(void) { return 0; }
8911 +static inline void tell_sched_cpu_down_begin(int cpu) { }
8912 +static inline void tell_sched_cpu_down_done(int cpu) { }
8915 #ifdef CONFIG_NO_HZ_COMMON
8916 @@ -2673,6 +2731,7 @@
8918 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
8919 extern int wake_up_process(struct task_struct *tsk);
8920 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
8921 extern void wake_up_new_task(struct task_struct *tsk);
8923 extern void kick_process(struct task_struct *tsk);
8924 @@ -2881,6 +2940,17 @@
8928 +#ifdef CONFIG_PREEMPT_RT_BASE
8929 +extern void __mmdrop_delayed(struct rcu_head *rhp);
8930 +static inline void mmdrop_delayed(struct mm_struct *mm)
8932 + if (atomic_dec_and_test(&mm->mm_count))
8933 + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
8936 +# define mmdrop_delayed(mm) mmdrop(mm)
8939 static inline void mmdrop_async_fn(struct work_struct *work)
8941 struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
8942 @@ -3273,6 +3343,43 @@
8943 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
8946 +#ifdef CONFIG_PREEMPT_LAZY
8947 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
8949 + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
8952 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
8954 + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
8957 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
8959 + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
8962 +static inline int need_resched_lazy(void)
8964 + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
8967 +static inline int need_resched_now(void)
8969 + return test_thread_flag(TIF_NEED_RESCHED);
8973 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
8974 +static inline int need_resched_lazy(void) { return 0; }
8976 +static inline int need_resched_now(void)
8978 + return test_thread_flag(TIF_NEED_RESCHED);
8983 static inline int restart_syscall(void)
8985 set_tsk_thread_flag(current, TIF_SIGPENDING);
8986 @@ -3304,6 +3411,51 @@
8987 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
8990 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
8992 + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
8994 +#ifdef CONFIG_PREEMPT_RT_FULL
8995 + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9001 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9003 + bool traced_stopped;
9005 +#ifdef CONFIG_PREEMPT_RT_FULL
9006 + unsigned long flags;
9008 + raw_spin_lock_irqsave(&task->pi_lock, flags);
9009 + traced_stopped = __task_is_stopped_or_traced(task);
9010 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9012 + traced_stopped = __task_is_stopped_or_traced(task);
9014 + return traced_stopped;
9017 +static inline bool task_is_traced(struct task_struct *task)
9019 + bool traced = false;
9021 + if (task->state & __TASK_TRACED)
9023 +#ifdef CONFIG_PREEMPT_RT_FULL
9024 + /* in case the task is sleeping on tasklist_lock */
9025 + raw_spin_lock_irq(&task->pi_lock);
9026 + if (task->state & __TASK_TRACED)
9028 + else if (task->saved_state & __TASK_TRACED)
9030 + raw_spin_unlock_irq(&task->pi_lock);
9036 * cond_resched() and cond_resched_lock(): latency reduction via
9037 * explicit rescheduling in places that are safe. The return
9038 @@ -3329,12 +3481,16 @@
9039 __cond_resched_lock(lock); \
9042 +#ifndef CONFIG_PREEMPT_RT_FULL
9043 extern int __cond_resched_softirq(void);
9045 #define cond_resched_softirq() ({ \
9046 ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
9047 __cond_resched_softirq(); \
9050 +# define cond_resched_softirq() cond_resched()
9053 static inline void cond_resched_rcu(void)
9055 @@ -3509,6 +3665,31 @@
9057 #endif /* CONFIG_SMP */
9059 +static inline int __migrate_disabled(struct task_struct *p)
9061 +#ifdef CONFIG_PREEMPT_RT_FULL
9062 + return p->migrate_disable;
9068 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9069 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9071 + if (__migrate_disabled(p))
9072 + return cpumask_of(task_cpu(p));
9074 + return &p->cpus_allowed;
9077 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9079 + if (__migrate_disabled(p))
9081 + return p->nr_cpus_allowed;
9084 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9085 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
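
tsk_cpus_allowed()/tsk_nr_cpus_allowed() become inline functions so the scheduler sees an effective single-CPU affinity while a task is migrate-disabled. An illustrative, hypothetical helper:

#include <linux/cpumask.h>
#include <linux/sched.h>

static unsigned int my_effective_cpus(struct task_struct *p)
{
	/* 1 while p runs under migrate_disable() on RT, otherwise
	 * the width of its regular affinity mask */
	return cpumask_weight(tsk_cpus_allowed(p));
}
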
9087 diff -Nur linux-4.9.6.orig/include/linux/seqlock.h linux-4.9.6/include/linux/seqlock.h
9088 --- linux-4.9.6.orig/include/linux/seqlock.h 2017-01-26 08:25:24.000000000 +0100
9089 +++ linux-4.9.6/include/linux/seqlock.h 2017-01-28 13:59:09.943656649 +0100
9090 @@ -220,20 +220,30 @@
9091 return __read_seqcount_retry(s, start);
9096 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9097 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9103 -static inline void raw_write_seqcount_end(seqcount_t *s)
9104 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9106 + preempt_disable_rt();
9107 + __raw_write_seqcount_begin(s);
9110 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9116 +static inline void raw_write_seqcount_end(seqcount_t *s)
9118 + __raw_write_seqcount_end(s);
9119 + preempt_enable_rt();
9123 * raw_write_seqcount_barrier - do a seq write barrier
9124 * @s: pointer to seqcount_t
9125 @@ -428,10 +438,32 @@
9127 * Read side functions for starting and finalizing a read side section.
9129 +#ifndef CONFIG_PREEMPT_RT_FULL
9130 static inline unsigned read_seqbegin(const seqlock_t *sl)
9132 return read_seqcount_begin(&sl->seqcount);
9135 +/*
9136 + * Starvation safe read side for RT
9137 + */
9138 +static inline unsigned read_seqbegin(seqlock_t *sl)
9139 +{
9140 +	unsigned ret;
9141 +
9142 +repeat:
9143 +	ret = ACCESS_ONCE(sl->seqcount.sequence);
9144 +	if (unlikely(ret & 1)) {
9145 +		/*
9146 +		 * Take the lock and let the writer proceed (i.e. possibly
9147 +		 * boost it), otherwise we could loop here forever.
9148 +		 */
9149 +		spin_unlock_wait(&sl->lock);
9150 +		goto repeat;
9151 +	}
9152 +	return ret;
9153 +}
9154 +#endif
9156 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9158 @@ -446,36 +478,45 @@
9159 static inline void write_seqlock(seqlock_t *sl)
9161 spin_lock(&sl->lock);
9162 - write_seqcount_begin(&sl->seqcount);
9163 + __raw_write_seqcount_begin(&sl->seqcount);
9166 +static inline int try_write_seqlock(seqlock_t *sl)
9167 +{
9168 +	if (spin_trylock(&sl->lock)) {
9169 +		__raw_write_seqcount_begin(&sl->seqcount);
9170 +		return 1;
9171 +	}
9172 +	return 0;
9173 +}
9175 static inline void write_sequnlock(seqlock_t *sl)
9177 - write_seqcount_end(&sl->seqcount);
9178 + __raw_write_seqcount_end(&sl->seqcount);
9179 spin_unlock(&sl->lock);
9182 static inline void write_seqlock_bh(seqlock_t *sl)
9184 spin_lock_bh(&sl->lock);
9185 - write_seqcount_begin(&sl->seqcount);
9186 + __raw_write_seqcount_begin(&sl->seqcount);
9189 static inline void write_sequnlock_bh(seqlock_t *sl)
9191 - write_seqcount_end(&sl->seqcount);
9192 + __raw_write_seqcount_end(&sl->seqcount);
9193 spin_unlock_bh(&sl->lock);
9196 static inline void write_seqlock_irq(seqlock_t *sl)
9198 spin_lock_irq(&sl->lock);
9199 - write_seqcount_begin(&sl->seqcount);
9200 + __raw_write_seqcount_begin(&sl->seqcount);
9203 static inline void write_sequnlock_irq(seqlock_t *sl)
9205 - write_seqcount_end(&sl->seqcount);
9206 + __raw_write_seqcount_end(&sl->seqcount);
9207 spin_unlock_irq(&sl->lock);
9211 unsigned long flags;
9213 spin_lock_irqsave(&sl->lock, flags);
9214 - write_seqcount_begin(&sl->seqcount);
9215 + __raw_write_seqcount_begin(&sl->seqcount);
9221 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9223 - write_seqcount_end(&sl->seqcount);
9224 + __raw_write_seqcount_end(&sl->seqcount);
9225 spin_unlock_irqrestore(&sl->lock, flags);
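
The __raw_write_seqcount_begin()/end() split keeps seqlock writers preemptible on RT (the enclosing spinlock already serializes them), while the RT read side briefly waits on that lock instead of spinning on an odd sequence count. The usual pattern is unchanged (my_seqlock/my_value are hypothetical):

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_seqlock);
static u64 my_value;

static void my_write(u64 v)
{
	write_seqlock(&my_seqlock);	/* no preempt_disable_rt() inside */
	my_value = v;
	write_sequnlock(&my_seqlock);
}

static u64 my_read(void)
{
	unsigned seq;
	u64 v;

	do {
		seq = read_seqbegin(&my_seqlock);
		v = my_value;
	} while (read_seqretry(&my_seqlock, seq));
	return v;
}
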
9228 diff -Nur linux-4.9.6.orig/include/linux/signal.h linux-4.9.6/include/linux/signal.h
9229 --- linux-4.9.6.orig/include/linux/signal.h 2017-01-26 08:25:24.000000000 +0100
9230 +++ linux-4.9.6/include/linux/signal.h 2017-01-28 13:59:09.947656802 +0100
9234 extern void flush_sigqueue(struct sigpending *queue);
9235 +extern void flush_task_sigqueue(struct task_struct *tsk);
9237 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9238 static inline int valid_signal(unsigned long sig)
9239 diff -Nur linux-4.9.6.orig/include/linux/skbuff.h linux-4.9.6/include/linux/skbuff.h
9240 --- linux-4.9.6.orig/include/linux/skbuff.h 2017-01-26 08:25:24.000000000 +0100
9241 +++ linux-4.9.6/include/linux/skbuff.h 2017-01-28 13:59:09.947656802 +0100
9246 + raw_spinlock_t raw_lock;
9250 @@ -1573,6 +1574,12 @@
9251 __skb_queue_head_init(list);
9254 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9256 + raw_spin_lock_init(&list->raw_lock);
9257 + __skb_queue_head_init(list);
9260 static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9261 struct lock_class_key *class)
9263 diff -Nur linux-4.9.6.orig/include/linux/smp.h linux-4.9.6/include/linux/smp.h
9264 --- linux-4.9.6.orig/include/linux/smp.h 2017-01-26 08:25:24.000000000 +0100
9265 +++ linux-4.9.6/include/linux/smp.h 2017-01-28 13:59:09.947656802 +0100
9267 #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
9268 #define put_cpu() preempt_enable()
9270 +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
9271 +#define put_cpu_light() migrate_enable()
9274 * Callback to arch code if there's nosmp or maxcpus=0 on the
9275 * boot command line:
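
get_cpu_light()/put_cpu_light() mirror get_cpu()/put_cpu() but only forbid migration, so the section stays preemptible on RT. Sketch:

#include <linux/smp.h>

static void my_cpu_bound_work(void)
{
	int cpu = get_cpu_light();	/* migrate_disable() + smp_processor_id() */

	/* CPU-local work that tolerates preemption but not migration */
	(void)cpu;
	put_cpu_light();		/* migrate_enable() */
}
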
9276 diff -Nur linux-4.9.6.orig/include/linux/spinlock_api_smp.h linux-4.9.6/include/linux/spinlock_api_smp.h
9277 --- linux-4.9.6.orig/include/linux/spinlock_api_smp.h 2017-01-26 08:25:24.000000000 +0100
9278 +++ linux-4.9.6/include/linux/spinlock_api_smp.h 2017-01-28 13:59:09.947656802 +0100
9283 -#include <linux/rwlock_api_smp.h>
9284 +#ifndef CONFIG_PREEMPT_RT_FULL
9285 +# include <linux/rwlock_api_smp.h>
9288 #endif /* __LINUX_SPINLOCK_API_SMP_H */
9289 diff -Nur linux-4.9.6.orig/include/linux/spinlock.h linux-4.9.6/include/linux/spinlock.h
9290 --- linux-4.9.6.orig/include/linux/spinlock.h 2017-01-26 08:25:24.000000000 +0100
9291 +++ linux-4.9.6/include/linux/spinlock.h 2017-01-28 13:59:09.947656802 +0100
9292 @@ -271,7 +271,11 @@
9293 #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
9295 /* Include rwlock functions */
9296 -#include <linux/rwlock.h>
9297 +#ifdef CONFIG_PREEMPT_RT_FULL
9298 +# include <linux/rwlock_rt.h>
9300 +# include <linux/rwlock.h>
9304 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9305 @@ -282,6 +286,10 @@
9306 # include <linux/spinlock_api_up.h>
9309 +#ifdef CONFIG_PREEMPT_RT_FULL
9310 +# include <linux/spinlock_rt.h>
9311 +#else /* PREEMPT_RT_FULL */
9314 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9316 @@ -347,6 +355,12 @@
9317 raw_spin_unlock(&lock->rlock);
9320 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
9322 + raw_spin_unlock(&lock->rlock);
9326 static __always_inline void spin_unlock_bh(spinlock_t *lock)
9328 raw_spin_unlock_bh(&lock->rlock);
9330 #define atomic_dec_and_lock(atomic, lock) \
9331 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9333 +#endif /* !PREEMPT_RT_FULL */
9335 #endif /* __LINUX_SPINLOCK_H */
9336 diff -Nur linux-4.9.6.orig/include/linux/spinlock_rt.h linux-4.9.6/include/linux/spinlock_rt.h
9337 --- linux-4.9.6.orig/include/linux/spinlock_rt.h 1970-01-01 01:00:00.000000000 +0100
9338 +++ linux-4.9.6/include/linux/spinlock_rt.h 2017-01-28 13:59:09.947656802 +0100
9340 +#ifndef __LINUX_SPINLOCK_RT_H
9341 +#define __LINUX_SPINLOCK_RT_H
9343 +#ifndef __LINUX_SPINLOCK_H
9344 +#error Do not include directly. Use spinlock.h
9347 +#include <linux/bug.h>
9350 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9352 +#define spin_lock_init(slock) \
9354 + static struct lock_class_key __key; \
9356 + rt_mutex_init(&(slock)->lock); \
9357 + __rt_spin_lock_init(slock, #slock, &__key); \
9360 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9361 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9362 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9364 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9365 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9366 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9367 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9368 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
9369 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9370 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9371 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9372 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9373 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9376 + * lockdep-less calls, for derived types like rwlock:
9377 + * (for trylock they can use rt_mutex_trylock() directly).
9379 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9380 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9381 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9383 +#define spin_lock(lock) rt_spin_lock(lock)
9385 +#define spin_lock_bh(lock) \
9387 + local_bh_disable(); \
9388 + rt_spin_lock(lock); \
9391 +#define spin_lock_irq(lock) spin_lock(lock)
9393 +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
9395 +#define spin_trylock(lock) \
9396 +({ \
9397 +	int __locked; \
9398 +	__locked = spin_do_trylock(lock); \
9399 +	__locked; \
9400 +})
9402 +#ifdef CONFIG_LOCKDEP
9403 +# define spin_lock_nested(lock, subclass) \
9405 + rt_spin_lock_nested(lock, subclass); \
9408 +#define spin_lock_bh_nested(lock, subclass) \
9410 + local_bh_disable(); \
9411 + rt_spin_lock_nested(lock, subclass); \
9414 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9416 + typecheck(unsigned long, flags); \
9418 + rt_spin_lock_nested(lock, subclass); \
9421 +# define spin_lock_nested(lock, subclass) spin_lock(lock)
9422 +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
9424 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9426 + typecheck(unsigned long, flags); \
9428 + spin_lock(lock); \
9432 +#define spin_lock_irqsave(lock, flags) \
9433 +	do { \
9434 +		typecheck(unsigned long, flags); \
9435 +		flags = 0; \
9436 +		spin_lock(lock); \
9437 +	} while (0)
9439 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9441 + unsigned long flags = 0;
9442 +#ifdef CONFIG_TRACE_IRQFLAGS
9443 + flags = rt_spin_lock_trace_flags(lock);
9445 + spin_lock(lock); /* lock_local */
9450 +/* FIXME: we need rt_spin_lock_nest_lock */
9451 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9453 +#define spin_unlock(lock) rt_spin_unlock(lock)
9454 +#define spin_unlock_no_deboost(lock) rt_spin_unlock_no_deboost(lock)
9456 +#define spin_unlock_bh(lock) \
9458 + rt_spin_unlock(lock); \
9459 + local_bh_enable(); \
9462 +#define spin_unlock_irq(lock) spin_unlock(lock)
9464 +#define spin_unlock_irqrestore(lock, flags) \
9466 + typecheck(unsigned long, flags); \
9468 + spin_unlock(lock); \
9471 +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
9472 +#define spin_trylock_irq(lock) spin_trylock(lock)
9474 +#define spin_trylock_irqsave(lock, flags) \
9475 + rt_spin_trylock_irqsave(lock, &(flags))
9477 +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
9479 +#ifdef CONFIG_GENERIC_LOCKBREAK
9480 +# define spin_is_contended(lock) ((lock)->break_lock)
9482 +# define spin_is_contended(lock) (((void)(lock), 0))
9485 +static inline int spin_can_lock(spinlock_t *lock)
9487 + return !rt_mutex_is_locked(&lock->lock);
9490 +static inline int spin_is_locked(spinlock_t *lock)
9492 + return rt_mutex_is_locked(&lock->lock);
9495 +static inline void assert_spin_locked(spinlock_t *lock)
9497 + BUG_ON(!spin_is_locked(lock));
9500 +#define atomic_dec_and_lock(atomic, lock) \
9501 +	atomic_dec_and_spin_lock(atomic, lock)
9502 +
9503 +#endif
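
On RT a spinlock_t wraps an rt_mutex, so even the _irqsave forms neither disable interrupts nor preemption; flags is retained purely for API compatibility. Sketch (my_lock is hypothetical):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

static void my_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);	/* may sleep under contention on RT */
	/* critical section, preemptible on RT */
	spin_unlock_irqrestore(&my_lock, flags);
}
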
9504 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types.h linux-4.9.6/include/linux/spinlock_types.h
9505 --- linux-4.9.6.orig/include/linux/spinlock_types.h 2017-01-26 08:25:24.000000000 +0100
9506 +++ linux-4.9.6/include/linux/spinlock_types.h 2017-01-28 13:59:09.947656802 +0100
9508 * Released under the General Public License (GPL).
9511 -#if defined(CONFIG_SMP)
9512 -# include <asm/spinlock_types.h>
9514 -# include <linux/spinlock_types_up.h>
9517 -#include <linux/lockdep.h>
9519 -typedef struct raw_spinlock {
9520 - arch_spinlock_t raw_lock;
9521 -#ifdef CONFIG_GENERIC_LOCKBREAK
9522 - unsigned int break_lock;
9524 -#ifdef CONFIG_DEBUG_SPINLOCK
9525 - unsigned int magic, owner_cpu;
9528 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9529 - struct lockdep_map dep_map;
9533 -#define SPINLOCK_MAGIC 0xdead4ead
9535 -#define SPINLOCK_OWNER_INIT ((void *)-1L)
9537 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9538 -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
9540 -# define SPIN_DEP_MAP_INIT(lockname)
9542 +#include <linux/spinlock_types_raw.h>
9544 -#ifdef CONFIG_DEBUG_SPINLOCK
9545 -# define SPIN_DEBUG_INIT(lockname) \
9546 - .magic = SPINLOCK_MAGIC, \
9547 - .owner_cpu = -1, \
9548 - .owner = SPINLOCK_OWNER_INIT,
9549 +#ifndef CONFIG_PREEMPT_RT_FULL
9550 +# include <linux/spinlock_types_nort.h>
9551 +# include <linux/rwlock_types.h>
9553 -# define SPIN_DEBUG_INIT(lockname)
9554 +# include <linux/rtmutex.h>
9555 +# include <linux/spinlock_types_rt.h>
9556 +# include <linux/rwlock_types_rt.h>
9559 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
9561 - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
9562 - SPIN_DEBUG_INIT(lockname) \
9563 - SPIN_DEP_MAP_INIT(lockname) }
9565 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
9566 - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9568 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9570 -typedef struct spinlock {
9572 - struct raw_spinlock rlock;
9574 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9575 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9577 - u8 __padding[LOCK_PADSIZE];
9578 - struct lockdep_map dep_map;
9584 -#define __SPIN_LOCK_INITIALIZER(lockname) \
9585 - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9587 -#define __SPIN_LOCK_UNLOCKED(lockname) \
9588 - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9590 -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9592 -#include <linux/rwlock_types.h>
9594 #endif /* __LINUX_SPINLOCK_TYPES_H */
9595 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_nort.h linux-4.9.6/include/linux/spinlock_types_nort.h
9596 --- linux-4.9.6.orig/include/linux/spinlock_types_nort.h 1970-01-01 01:00:00.000000000 +0100
9597 +++ linux-4.9.6/include/linux/spinlock_types_nort.h 2017-01-28 13:59:09.947656802 +0100
9599 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
9600 +#define __LINUX_SPINLOCK_TYPES_NORT_H
9602 +#ifndef __LINUX_SPINLOCK_TYPES_H
9603 +#error "Do not include directly. Include spinlock_types.h instead"
9607 + * The non-RT version maps spinlocks to raw_spinlocks
9609 +typedef struct spinlock {
9611 + struct raw_spinlock rlock;
9613 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9614 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
9616 + u8 __padding[LOCK_PADSIZE];
9617 + struct lockdep_map dep_map;
9623 +#define __SPIN_LOCK_INITIALIZER(lockname) \
9624 + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
9626 +#define __SPIN_LOCK_UNLOCKED(lockname) \
9627 + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
9629 +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
9632 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_raw.h linux-4.9.6/include/linux/spinlock_types_raw.h
9633 --- linux-4.9.6.orig/include/linux/spinlock_types_raw.h 1970-01-01 01:00:00.000000000 +0100
9634 +++ linux-4.9.6/include/linux/spinlock_types_raw.h 2017-01-28 13:59:09.947656802 +0100
9636 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
9637 +#define __LINUX_SPINLOCK_TYPES_RAW_H
9639 +#if defined(CONFIG_SMP)
9640 +# include <asm/spinlock_types.h>
9642 +# include <linux/spinlock_types_up.h>
9645 +#include <linux/lockdep.h>
9647 +typedef struct raw_spinlock {
9648 + arch_spinlock_t raw_lock;
9649 +#ifdef CONFIG_GENERIC_LOCKBREAK
9650 + unsigned int break_lock;
9652 +#ifdef CONFIG_DEBUG_SPINLOCK
9653 + unsigned int magic, owner_cpu;
9656 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9657 + struct lockdep_map dep_map;
9661 +#define SPINLOCK_MAGIC 0xdead4ead
9663 +#define SPINLOCK_OWNER_INIT ((void *)-1L)
9665 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9666 +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
9668 +# define SPIN_DEP_MAP_INIT(lockname)
9671 +#ifdef CONFIG_DEBUG_SPINLOCK
9672 +# define SPIN_DEBUG_INIT(lockname) \
9673 + .magic = SPINLOCK_MAGIC, \
9674 + .owner_cpu = -1, \
9675 + .owner = SPINLOCK_OWNER_INIT,
9677 +# define SPIN_DEBUG_INIT(lockname)
9680 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
9682 + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
9683 + SPIN_DEBUG_INIT(lockname) \
9684 + SPIN_DEP_MAP_INIT(lockname) }
9686 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
9687 + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
9689 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
9692 diff -Nur linux-4.9.6.orig/include/linux/spinlock_types_rt.h linux-4.9.6/include/linux/spinlock_types_rt.h
9693 --- linux-4.9.6.orig/include/linux/spinlock_types_rt.h 1970-01-01 01:00:00.000000000 +0100
9694 +++ linux-4.9.6/include/linux/spinlock_types_rt.h 2017-01-28 13:59:09.947656802 +0100
9696 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
9697 +#define __LINUX_SPINLOCK_TYPES_RT_H
9699 +#ifndef __LINUX_SPINLOCK_TYPES_H
9700 +#error "Do not include directly. Include spinlock_types.h instead"
9703 +#include <linux/cache.h>
9706 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
9708 +typedef struct spinlock {
9709 + struct rt_mutex lock;
9710 + unsigned int break_lock;
9711 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9712 + struct lockdep_map dep_map;
9716 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9717 +# define __RT_SPIN_INITIALIZER(name) \
9719 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9720 + .save_state = 1, \
9721 + .file = __FILE__, \
9722 + .line = __LINE__, \
9725 +# define __RT_SPIN_INITIALIZER(name) \
9727 + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
9728 + .save_state = 1, \
9733 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
9736 +#define __SPIN_LOCK_UNLOCKED(name) \
9737 + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
9738 + SPIN_DEP_MAP_INIT(name) }
9740 +#define DEFINE_SPINLOCK(name) \
9741 + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
9744 diff -Nur linux-4.9.6.orig/include/linux/srcu.h linux-4.9.6/include/linux/srcu.h
9745 --- linux-4.9.6.orig/include/linux/srcu.h 2017-01-26 08:25:24.000000000 +0100
9746 +++ linux-4.9.6/include/linux/srcu.h 2017-01-28 13:59:09.947656802 +0100
9749 void process_srcu(struct work_struct *work);
9751 -#define __SRCU_STRUCT_INIT(name) \
9752 +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
9754 .completed = -300, \
9755 - .per_cpu_ref = &name##_srcu_array, \
9756 + .per_cpu_ref = &pcpu_name, \
9757 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
9759 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
9762 #define __DEFINE_SRCU(name, is_static) \
9763 static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
9764 - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
9765 + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
9766 #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
9767 #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
9769 diff -Nur linux-4.9.6.orig/include/linux/suspend.h linux-4.9.6/include/linux/suspend.h
9770 --- linux-4.9.6.orig/include/linux/suspend.h 2017-01-26 08:25:24.000000000 +0100
9771 +++ linux-4.9.6/include/linux/suspend.h 2017-01-28 13:59:09.947656802 +0100
9772 @@ -193,6 +193,12 @@
9776 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
9777 +extern bool pm_in_action;
9779 +# define pm_in_action false
9782 #ifdef CONFIG_SUSPEND
9784 * suspend_set_ops - set platform dependent suspend operations
9785 diff -Nur linux-4.9.6.orig/include/linux/swait.h linux-4.9.6/include/linux/swait.h
9786 --- linux-4.9.6.orig/include/linux/swait.h 2017-01-26 08:25:24.000000000 +0100
9787 +++ linux-4.9.6/include/linux/swait.h 2017-01-28 13:59:09.947656802 +0100
9789 extern void swake_up(struct swait_queue_head *q);
9790 extern void swake_up_all(struct swait_queue_head *q);
9791 extern void swake_up_locked(struct swait_queue_head *q);
9792 +extern void swake_up_all_locked(struct swait_queue_head *q);
9794 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
9795 extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
9796 diff -Nur linux-4.9.6.orig/include/linux/swap.h linux-4.9.6/include/linux/swap.h
9797 --- linux-4.9.6.orig/include/linux/swap.h 2017-01-26 08:25:24.000000000 +0100
9798 +++ linux-4.9.6/include/linux/swap.h 2017-01-28 13:59:09.947656802 +0100
9800 #include <linux/fs.h>
9801 #include <linux/atomic.h>
9802 #include <linux/page-flags.h>
9803 +#include <linux/locallock.h>
9804 #include <asm/page.h>
9806 struct notifier_block;
9808 void *workingset_eviction(struct address_space *mapping, struct page *page);
9809 bool workingset_refault(void *shadow);
9810 void workingset_activation(struct page *page);
9811 -extern struct list_lru workingset_shadow_nodes;
9812 +extern struct list_lru __workingset_shadow_nodes;
9813 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
9815 static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
9820 /* linux/mm/swap.c */
9821 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
9822 extern void lru_cache_add(struct page *);
9823 extern void lru_cache_add_anon(struct page *page);
9824 extern void lru_cache_add_file(struct page *page);
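The DECLARE_LOCAL_IRQ_LOCK() lines above rely on the locallock facility this patch adds in include/linux/locallock.h, which is not part of this excerpt. A hedged sketch of the consumer side as mm/swap.c is expected to use swapvec_lock; the helpers are assumed to degrade to local_irq_save()/local_irq_restore() on !RT:

#include <linux/locallock.h>

DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);	/* the real definition lives in mm/swap.c */

static void my_pagevec_op(struct page *page)
{
	unsigned long flags;

	local_lock_irqsave(swapvec_lock, flags);
	/* per-CPU pagevec manipulation: still serialized per CPU, but a
	 * sleeping lock on RT instead of a hard interrupts-off section */
	local_unlock_irqrestore(swapvec_lock, flags);
}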
9825 diff -Nur linux-4.9.6.orig/include/linux/swork.h linux-4.9.6/include/linux/swork.h
9826 --- linux-4.9.6.orig/include/linux/swork.h 1970-01-01 01:00:00.000000000 +0100
9827 +++ linux-4.9.6/include/linux/swork.h 2017-01-28 13:59:09.947656802 +0100
9829 +#ifndef _LINUX_SWORK_H
9830 +#define _LINUX_SWORK_H
9832 +#include <linux/list.h>
9834 +struct swork_event {
9835 + struct list_head item;
9836 + unsigned long flags;
9837 + void (*func)(struct swork_event *);
9840 +static inline void INIT_SWORK(struct swork_event *event,
9841 + void (*func)(struct swork_event *))
9844 + event->func = func;
9847 +bool swork_queue(struct swork_event *sev);
9849 +int swork_get(void);
9850 +void swork_put(void);
9852 +#endif /* _LINUX_SWORK_H */
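The header above is the entire client-visible surface of the simple-work facility; its kthread backend arrives elsewhere in this patch. A hedged usage sketch (my_event_fn and my_client_init are illustrative names):

#include <linux/swork.h>

static struct swork_event my_event;

static void my_event_fn(struct swork_event *sev)
{
	/* runs in the swork helper thread: preemptible, may sleep */
}

static int my_client_init(void)
{
	int err = swork_get();	/* ensure the helper thread exists */

	if (err)
		return err;
	INIT_SWORK(&my_event, my_event_fn);
	swork_queue(&my_event);	/* safe from contexts where queue_work()
				 * would be problematic on RT */
	return 0;		/* balance with swork_put() on teardown */
}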
9853 diff -Nur linux-4.9.6.orig/include/linux/thread_info.h linux-4.9.6/include/linux/thread_info.h
9854 --- linux-4.9.6.orig/include/linux/thread_info.h 2017-01-26 08:25:24.000000000 +0100
9855 +++ linux-4.9.6/include/linux/thread_info.h 2017-01-28 13:59:09.947656802 +0100
9856 @@ -107,7 +107,17 @@
9857 #define test_thread_flag(flag) \
9858 test_ti_thread_flag(current_thread_info(), flag)
9860 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
9861 +#ifdef CONFIG_PREEMPT_LAZY
9862 +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
9863 + test_thread_flag(TIF_NEED_RESCHED_LAZY))
9864 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
9865 +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
9868 +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
9869 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
9870 +#define tif_need_resched_lazy() 0
9873 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
9874 static inline int arch_within_stack_frames(const void * const stack,
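A hedged sketch of how the split accessors above are meant to be read: generic code keeps testing tif_need_resched(), which now folds in the lazy flag, while RT-aware paths can distinguish an immediate request from one that may be deferred:

/* illustrative only; the real consumers are the scheduler and the
 * architecture exit paths changed elsewhere in this patch */
static bool my_must_resched_now(void)
{
	if (!tif_need_resched())	/* neither flag set */
		return false;

	/* only a "now" request preempts immediately; a lazy request is
	 * honoured once the task leaves its preempt-lazy region */
	return tif_need_resched_now();
}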
9875 diff -Nur linux-4.9.6.orig/include/linux/timer.h linux-4.9.6/include/linux/timer.h
9876 --- linux-4.9.6.orig/include/linux/timer.h 2017-01-26 08:25:24.000000000 +0100
9877 +++ linux-4.9.6/include/linux/timer.h 2017-01-28 13:59:09.947656802 +0100
9880 extern int try_to_del_timer_sync(struct timer_list *timer);
9883 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
9884 extern int del_timer_sync(struct timer_list *timer);
9886 # define del_timer_sync(t) del_timer(t)
9887 diff -Nur linux-4.9.6.orig/include/linux/trace_events.h linux-4.9.6/include/linux/trace_events.h
9888 --- linux-4.9.6.orig/include/linux/trace_events.h 2017-01-26 08:25:24.000000000 +0100
9889 +++ linux-4.9.6/include/linux/trace_events.h 2017-01-28 13:59:09.947656802 +0100
9891 unsigned char flags;
9892 unsigned char preempt_count;
9894 + unsigned short migrate_disable;
9895 + unsigned short padding;
9896 + unsigned char preempt_lazy_count;
9899 #define TRACE_EVENT_TYPE_MAX \
9900 diff -Nur linux-4.9.6.orig/include/linux/uaccess.h linux-4.9.6/include/linux/uaccess.h
9901 --- linux-4.9.6.orig/include/linux/uaccess.h 2017-01-26 08:25:24.000000000 +0100
9902 +++ linux-4.9.6/include/linux/uaccess.h 2017-01-28 13:59:09.947656802 +0100
9905 static inline void pagefault_disable(void)
9907 + migrate_disable();
9908 pagefault_disabled_inc();
9910 * make sure to have issued the store before a pagefault
9914 pagefault_disabled_dec();
9919 diff -Nur linux-4.9.6.orig/include/linux/uprobes.h linux-4.9.6/include/linux/uprobes.h
9920 --- linux-4.9.6.orig/include/linux/uprobes.h 2017-01-26 08:25:24.000000000 +0100
9921 +++ linux-4.9.6/include/linux/uprobes.h 2017-01-28 13:59:09.947656802 +0100
9923 #include <linux/errno.h>
9924 #include <linux/rbtree.h>
9925 #include <linux/types.h>
9926 +#include <linux/wait.h>
9928 struct vm_area_struct;
9930 diff -Nur linux-4.9.6.orig/include/linux/vmstat.h linux-4.9.6/include/linux/vmstat.h
9931 --- linux-4.9.6.orig/include/linux/vmstat.h 2017-01-26 08:25:24.000000000 +0100
9932 +++ linux-4.9.6/include/linux/vmstat.h 2017-01-28 13:59:09.947656802 +0100
9935 static inline void __count_vm_event(enum vm_event_item item)
9937 + preempt_disable_rt();
9938 raw_cpu_inc(vm_event_states.event[item]);
9939 + preempt_enable_rt();
9942 static inline void count_vm_event(enum vm_event_item item)
9945 static inline void __count_vm_events(enum vm_event_item item, long delta)
9947 + preempt_disable_rt();
9948 raw_cpu_add(vm_event_states.event[item], delta);
9949 + preempt_enable_rt();
9952 static inline void count_vm_events(enum vm_event_item item, long delta)
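preempt_disable_rt()/preempt_enable_rt() come from this patch's include/linux/preempt.h changes, which are not part of this excerpt. Their presumed shape, for orientation: on !RT the callers already run with preemption off, so the guards cost nothing; on RT they pin the task so the raw_cpu_*() ops stay CPU-local:

/* presumed definitions, mirroring the usual RT-patch pattern */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()	preempt_disable()
# define preempt_enable_rt()	preempt_enable()
#else
# define preempt_disable_rt()	barrier()
# define preempt_enable_rt()	barrier()
#endif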
9953 diff -Nur linux-4.9.6.orig/include/linux/wait.h linux-4.9.6/include/linux/wait.h
9954 --- linux-4.9.6.orig/include/linux/wait.h 2017-01-26 08:25:24.000000000 +0100
9955 +++ linux-4.9.6/include/linux/wait.h 2017-01-28 13:59:09.947656802 +0100
9957 #include <linux/spinlock.h>
9958 #include <asm/current.h>
9959 #include <uapi/linux/wait.h>
9960 +#include <linux/atomic.h>
9962 typedef struct __wait_queue wait_queue_t;
9963 typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
9964 diff -Nur linux-4.9.6.orig/include/net/dst.h linux-4.9.6/include/net/dst.h
9965 --- linux-4.9.6.orig/include/net/dst.h 2017-01-26 08:25:24.000000000 +0100
9966 +++ linux-4.9.6/include/net/dst.h 2017-01-28 13:59:09.947656802 +0100
9968 static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
9969 struct sk_buff *skb)
9971 - const struct hh_cache *hh;
9972 + struct hh_cache *hh;
9974 if (dst->pending_confirm) {
9975 unsigned long now = jiffies;
9976 diff -Nur linux-4.9.6.orig/include/net/gen_stats.h linux-4.9.6/include/net/gen_stats.h
9977 --- linux-4.9.6.orig/include/net/gen_stats.h 2017-01-26 08:25:24.000000000 +0100
9978 +++ linux-4.9.6/include/net/gen_stats.h 2017-01-28 13:59:09.947656802 +0100
9980 #include <linux/socket.h>
9981 #include <linux/rtnetlink.h>
9982 #include <linux/pkt_sched.h>
9983 +#include <net/net_seq_lock.h>
9985 struct gnet_stats_basic_cpu {
9986 struct gnet_stats_basic_packed bstats;
9988 spinlock_t *lock, struct gnet_dump *d,
9991 -int gnet_stats_copy_basic(const seqcount_t *running,
9992 +int gnet_stats_copy_basic(net_seqlock_t *running,
9993 struct gnet_dump *d,
9994 struct gnet_stats_basic_cpu __percpu *cpu,
9995 struct gnet_stats_basic_packed *b);
9996 -void __gnet_stats_copy_basic(const seqcount_t *running,
9997 +void __gnet_stats_copy_basic(net_seqlock_t *running,
9998 struct gnet_stats_basic_packed *bstats,
9999 struct gnet_stats_basic_cpu __percpu *cpu,
10000 struct gnet_stats_basic_packed *b);
10001 @@ -55,14 +56,14 @@
10002 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10003 struct gnet_stats_rate_est64 *rate_est,
10004 spinlock_t *stats_lock,
10005 - seqcount_t *running, struct nlattr *opt);
10006 + net_seqlock_t *running, struct nlattr *opt);
10007 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10008 struct gnet_stats_rate_est64 *rate_est);
10009 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10010 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10011 struct gnet_stats_rate_est64 *rate_est,
10012 spinlock_t *stats_lock,
10013 - seqcount_t *running, struct nlattr *opt);
10014 + net_seqlock_t *running, struct nlattr *opt);
10015 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10016 const struct gnet_stats_rate_est64 *rate_est);
10018 diff -Nur linux-4.9.6.orig/include/net/neighbour.h linux-4.9.6/include/net/neighbour.h
10019 --- linux-4.9.6.orig/include/net/neighbour.h 2017-01-26 08:25:24.000000000 +0100
10020 +++ linux-4.9.6/include/net/neighbour.h 2017-01-28 13:59:09.947656802 +0100
10021 @@ -446,7 +446,7 @@
10025 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10026 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10030 @@ -501,7 +501,7 @@
10032 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
10034 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10035 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10036 const struct net_device *dev)
10039 diff -Nur linux-4.9.6.orig/include/net/netns/ipv4.h linux-4.9.6/include/net/netns/ipv4.h
10040 --- linux-4.9.6.orig/include/net/netns/ipv4.h 2017-01-26 08:25:24.000000000 +0100
10041 +++ linux-4.9.6/include/net/netns/ipv4.h 2017-01-28 13:59:09.947656802 +0100
10044 int sysctl_icmp_echo_ignore_all;
10045 int sysctl_icmp_echo_ignore_broadcasts;
10046 + int sysctl_icmp_echo_sysrq;
10047 int sysctl_icmp_ignore_bogus_error_responses;
10048 int sysctl_icmp_ratelimit;
10049 int sysctl_icmp_ratemask;
10050 diff -Nur linux-4.9.6.orig/include/net/net_seq_lock.h linux-4.9.6/include/net/net_seq_lock.h
10051 --- linux-4.9.6.orig/include/net/net_seq_lock.h 1970-01-01 01:00:00.000000000 +0100
10052 +++ linux-4.9.6/include/net/net_seq_lock.h 2017-01-28 13:59:09.947656802 +0100
10054 +#ifndef __NET_NET_SEQ_LOCK_H__
10055 +#define __NET_NET_SEQ_LOCK_H__
10057 +#ifdef CONFIG_PREEMPT_RT_BASE
10058 +# define net_seqlock_t seqlock_t
10059 +# define net_seq_begin(__r) read_seqbegin(__r)
10060 +# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
10063 +# define net_seqlock_t seqcount_t
10064 +# define net_seq_begin(__r) read_seqcount_begin(__r)
10065 +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
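Readers are written once against these macros and work in both configurations. A hedged sketch of a retry loop over the basic byte counter (field names from struct gnet_stats_basic_packed):

#include <net/gen_stats.h>
#include <net/net_seq_lock.h>

static u64 my_read_bytes(net_seqlock_t *running,
			 const struct gnet_stats_basic_packed *b)
{
	unsigned int seq;
	u64 bytes;

	do {
		seq = net_seq_begin(running);	/* seqlock read side on RT */
		bytes = b->bytes;
	} while (net_seq_retry(running, seq));

	return bytes;
}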
10069 diff -Nur linux-4.9.6.orig/include/net/sch_generic.h linux-4.9.6/include/net/sch_generic.h
10070 --- linux-4.9.6.orig/include/net/sch_generic.h 2017-01-26 08:25:24.000000000 +0100
10071 +++ linux-4.9.6/include/net/sch_generic.h 2017-01-28 13:59:09.951656955 +0100
10073 #include <linux/dynamic_queue_limits.h>
10074 #include <net/gen_stats.h>
10075 #include <net/rtnetlink.h>
10076 +#include <net/net_seq_lock.h>
10079 struct qdisc_walker;
10081 struct sk_buff *gso_skb ____cacheline_aligned_in_smp;
10082 struct qdisc_skb_head q;
10083 struct gnet_stats_basic_packed bstats;
10084 - seqcount_t running;
10085 + net_seqlock_t running;
10086 struct gnet_stats_queue qstats;
10087 unsigned long state;
10088 struct Qdisc *next_sched;
10089 @@ -98,13 +99,22 @@
10090 spinlock_t busylock ____cacheline_aligned_in_smp;
10093 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10094 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10096 +#ifdef CONFIG_PREEMPT_RT_BASE
10097 + return spin_is_locked(&qdisc->running.lock) ? true : false;
10099 return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10103 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10105 +#ifdef CONFIG_PREEMPT_RT_BASE
10106 + if (try_write_seqlock(&qdisc->running))
10110 if (qdisc_is_running(qdisc))
10112 /* Variant of write_seqcount_begin() telling lockdep a trylock
10113 @@ -113,11 +123,16 @@
10114 raw_write_seqcount_begin(&qdisc->running);
10115 seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10120 static inline void qdisc_run_end(struct Qdisc *qdisc)
10122 +#ifdef CONFIG_PREEMPT_RT_BASE
10123 + write_sequnlock(&qdisc->running);
10125 write_seqcount_end(&qdisc->running);
10129 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10130 @@ -308,7 +323,7 @@
10131 return qdisc_lock(root);
10134 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10135 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10137 struct Qdisc *root = qdisc_root_sleeping(qdisc);
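For orientation, a hedged sketch of the caller pattern these helpers serve, mirroring the stock qdisc_run(): exactly one CPU runs a qdisc at a time, and on RT the seqlock trylock provides that exclusion without a preemption-disabled seqcount writer:

static void my_qdisc_run(struct Qdisc *q)
{
	if (qdisc_run_begin(q))
		__qdisc_run(q);	/* dequeue/transmit loop; calls
				 * qdisc_run_end() when it finishes */
}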
10139 diff -Nur linux-4.9.6.orig/include/trace/events/hist.h linux-4.9.6/include/trace/events/hist.h
10140 --- linux-4.9.6.orig/include/trace/events/hist.h 1970-01-01 01:00:00.000000000 +0100
10141 +++ linux-4.9.6/include/trace/events/hist.h 2017-01-28 13:59:09.951656955 +0100
10143 +#undef TRACE_SYSTEM
10144 +#define TRACE_SYSTEM hist
10146 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10147 +#define _TRACE_HIST_H
10149 +#include "latency_hist.h"
10150 +#include <linux/tracepoint.h>
10152 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10153 +#define trace_preemptirqsoff_hist(a, b)
10154 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10156 +TRACE_EVENT(preemptirqsoff_hist,
10158 + TP_PROTO(int reason, int starthist),
10160 + TP_ARGS(reason, starthist),
10162 + TP_STRUCT__entry(
10163 + __field(int, reason)
10164 + __field(int, starthist)
10168 + __entry->reason = reason;
10169 + __entry->starthist = starthist;
10172 + TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10173 + __entry->starthist ? "start" : "stop")
10177 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10178 +#define trace_hrtimer_interrupt(a, b, c, d)
10180 +TRACE_EVENT(hrtimer_interrupt,
10182 + TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10183 + struct task_struct *task),
10185 + TP_ARGS(cpu, offset, curr, task),
10187 + TP_STRUCT__entry(
10188 + __field(int, cpu)
10189 + __field(long long, offset)
10190 + __array(char, ccomm, TASK_COMM_LEN)
10191 + __field(int, cprio)
10192 + __array(char, tcomm, TASK_COMM_LEN)
10193 + __field(int, tprio)
10197 + __entry->cpu = cpu;
10198 + __entry->offset = offset;
10199 + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10200 + __entry->cprio = curr->prio;
10201 + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10202 + task != NULL ? TASK_COMM_LEN : 7);
10203 + __entry->tprio = task != NULL ? task->prio : -1;
10206 + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10207 + __entry->cpu, __entry->offset, __entry->ccomm,
10208 + __entry->cprio, __entry->tcomm, __entry->tprio)
10212 +#endif /* _TRACE_HIST_H */
10214 +/* This part must be outside protection */
10215 +#include <trace/define_trace.h>
10216 diff -Nur linux-4.9.6.orig/include/trace/events/latency_hist.h linux-4.9.6/include/trace/events/latency_hist.h
10217 --- linux-4.9.6.orig/include/trace/events/latency_hist.h 1970-01-01 01:00:00.000000000 +0100
10218 +++ linux-4.9.6/include/trace/events/latency_hist.h 2017-01-28 13:59:09.951656955 +0100
10220 +#ifndef _LATENCY_HIST_H
10221 +#define _LATENCY_HIST_H
10223 +enum hist_action {
10232 +static char *actions[] = {
10241 +static inline char *getaction(int action)
10243 + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10244 + return actions[action];
10245 + return "unknown";
10248 +#endif /* _LATENCY_HIST_H */
10249 diff -Nur linux-4.9.6.orig/init/Kconfig linux-4.9.6/init/Kconfig
10250 --- linux-4.9.6.orig/init/Kconfig 2017-01-26 08:25:24.000000000 +0100
10251 +++ linux-4.9.6/init/Kconfig 2017-01-28 13:59:09.951656955 +0100
10252 @@ -506,7 +506,7 @@
10255 bool "Make expert-level adjustments to RCU configuration"
10257 + default y if PREEMPT_RT_FULL
10259 This option needs to be enabled if you wish to make
10260 expert-level adjustments to RCU configuration. By default,
10261 @@ -623,7 +623,7 @@
10263 config RCU_FAST_NO_HZ
10264 bool "Accelerate last non-dyntick-idle CPU's grace periods"
10265 - depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10266 + depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10269 This option permits CPUs to enter dynticks-idle state even if
10270 @@ -650,7 +650,7 @@
10272 bool "Enable RCU priority boosting"
10273 depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10275 + default y if PREEMPT_RT_FULL
10277 This option boosts the priority of preempted RCU readers that
10278 block the current preemptible RCU grace period for too long.
10279 @@ -781,19 +781,6 @@
10283 -config RCU_EXPEDITE_BOOT
10287 - This option enables expedited grace periods at boot time,
10288 - as if rcu_expedite_gp() had been invoked early in boot.
10289 - The corresponding rcu_unexpedite_gp() is invoked from
10290 - rcu_end_inkernel_boot(), which is intended to be invoked
10291 - at the end of the kernel-only boot sequence, just before
10294 - Accept the default if unsure.
10296 endmenu # "RCU Subsystem"
10299 @@ -1064,6 +1051,7 @@
10300 config RT_GROUP_SCHED
10301 bool "Group scheduling for SCHED_RR/FIFO"
10302 depends on CGROUP_SCHED
10303 + depends on !PREEMPT_RT_FULL
10306 This feature lets you explicitly allocate real CPU bandwidth
10307 @@ -1772,6 +1760,7 @@
10311 + depends on !PREEMPT_RT_FULL
10312 select HAVE_HARDENED_USERCOPY_ALLOCATOR
10314 The regular slab allocator that is established and known to work
10315 @@ -1792,6 +1781,7 @@
10318 bool "SLOB (Simple Allocator)"
10319 + depends on !PREEMPT_RT_FULL
10321 SLOB replaces the stock allocator with a drastically simpler
10322 allocator. SLOB is generally more space efficient but
10323 @@ -1810,7 +1800,7 @@
10325 config SLUB_CPU_PARTIAL
10327 - depends on SLUB && SMP
10328 + depends on SLUB && SMP && !PREEMPT_RT_FULL
10329 bool "SLUB per cpu partial cache"
10331 Per cpu partial caches accelerate object allocation and freeing
10332 diff -Nur linux-4.9.6.orig/init/main.c linux-4.9.6/init/main.c
10333 --- linux-4.9.6.orig/init/main.c 2017-01-26 08:25:24.000000000 +0100
10334 +++ linux-4.9.6/init/main.c 2017-01-28 13:59:09.951656955 +0100
10335 @@ -507,6 +507,7 @@
10336 setup_command_line(command_line);
10337 setup_nr_cpu_ids();
10338 setup_per_cpu_areas();
10339 + softirq_early_init();
10340 boot_cpu_state_init();
10341 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10343 diff -Nur linux-4.9.6.orig/init/Makefile linux-4.9.6/init/Makefile
10344 --- linux-4.9.6.orig/init/Makefile 2017-01-26 08:25:24.000000000 +0100
10345 +++ linux-4.9.6/init/Makefile 2017-01-28 13:59:09.951656955 +0100
10347 include/generated/compile.h: FORCE
10348 @$($(quiet)chk_compile.h)
10349 $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10350 - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10351 + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10352 diff -Nur linux-4.9.6.orig/ipc/sem.c linux-4.9.6/ipc/sem.c
10353 --- linux-4.9.6.orig/ipc/sem.c 2017-01-26 08:25:24.000000000 +0100
10354 +++ linux-4.9.6/ipc/sem.c 2017-01-28 13:59:09.951656955 +0100
10355 @@ -712,6 +712,13 @@
10356 static void wake_up_sem_queue_prepare(struct list_head *pt,
10357 struct sem_queue *q, int error)
10359 +#ifdef CONFIG_PREEMPT_RT_BASE
10360 + struct task_struct *p = q->sleeper;
10361 + get_task_struct(p);
10362 + q->status = error;
10363 + wake_up_process(p);
10364 + put_task_struct(p);
10366 if (list_empty(pt)) {
10368 * Hold preempt off so that we don't get preempted and have the
10369 @@ -723,6 +730,7 @@
10372 list_add_tail(&q->list, pt);
10377 @@ -736,6 +744,7 @@
10379 static void wake_up_sem_queue_do(struct list_head *pt)
10381 +#ifndef CONFIG_PREEMPT_RT_BASE
10382 struct sem_queue *q, *t;
10385 @@ -748,6 +757,7 @@
10392 static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10393 diff -Nur linux-4.9.6.orig/kernel/cgroup.c linux-4.9.6/kernel/cgroup.c
10394 --- linux-4.9.6.orig/kernel/cgroup.c 2017-01-26 08:25:24.000000000 +0100
10395 +++ linux-4.9.6/kernel/cgroup.c 2017-01-28 13:59:09.951656955 +0100
10396 @@ -5040,10 +5040,10 @@
10397 queue_work(cgroup_destroy_wq, &css->destroy_work);
10400 -static void css_release_work_fn(struct work_struct *work)
10401 +static void css_release_work_fn(struct swork_event *sev)
10403 struct cgroup_subsys_state *css =
10404 - container_of(work, struct cgroup_subsys_state, destroy_work);
10405 + container_of(sev, struct cgroup_subsys_state, destroy_swork);
10406 struct cgroup_subsys *ss = css->ss;
10407 struct cgroup *cgrp = css->cgroup;
10409 @@ -5086,8 +5086,8 @@
10410 struct cgroup_subsys_state *css =
10411 container_of(ref, struct cgroup_subsys_state, refcnt);
10413 - INIT_WORK(&css->destroy_work, css_release_work_fn);
10414 - queue_work(cgroup_destroy_wq, &css->destroy_work);
10415 + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10416 + swork_queue(&css->destroy_swork);
10419 static void init_and_link_css(struct cgroup_subsys_state *css,
10420 @@ -5742,6 +5742,7 @@
10422 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10423 BUG_ON(!cgroup_destroy_wq);
10424 + BUG_ON(swork_get());
10427 * Used to destroy pidlists and separate to serve as flush domain.
10428 diff -Nur linux-4.9.6.orig/kernel/cpu.c linux-4.9.6/kernel/cpu.c
10429 --- linux-4.9.6.orig/kernel/cpu.c 2017-01-26 08:25:24.000000000 +0100
10430 +++ linux-4.9.6/kernel/cpu.c 2017-01-28 13:59:09.951656955 +0100
10431 @@ -239,6 +239,289 @@
10432 #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
10433 #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
10436 + * hotplug_pcp - per cpu hotplug descriptor
10437 + * @unplug: set when pin_current_cpu() needs to sync tasks
10438 + * @sync_tsk: the task that waits for tasks to finish pinned sections
10439 + * @refcount: counter of tasks in pinned sections
10440 + * @grab_lock: set when the tasks entering pinned sections should wait
10441 + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
10442 + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
10443 + * @mutex_init: zero if the mutex hasn't been initialized yet.
10445 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
10446 + * is used as a flag and still exists after @sync_tsk has exited and
10447 + * @sync_tsk set to NULL.
10449 +struct hotplug_pcp {
10450 + struct task_struct *unplug;
10451 + struct task_struct *sync_tsk;
10454 + struct completion synced;
10455 + struct completion unplug_wait;
10456 +#ifdef CONFIG_PREEMPT_RT_FULL
10458 + * Note, on PREEMPT_RT, the hotplug lock must save the state of
10459 + * the task, otherwise the mutex will cause the task to fail
10460 + * to sleep when required. (Because it's called from migrate_disable())
10462 + * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
10467 + struct mutex mutex;
10472 +#ifdef CONFIG_PREEMPT_RT_FULL
10473 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
10474 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
10476 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
10477 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
10480 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
10483 + * pin_current_cpu - Prevent the current cpu from being unplugged
10485 + * Lightweight version of get_online_cpus() to prevent cpu from being
10486 + * unplugged when code runs in a migration disabled region.
10488 + * Must be called with preemption disabled (preempt_count = 1)!
10490 +void pin_current_cpu(void)
10492 + struct hotplug_pcp *hp;
10496 + hp = this_cpu_ptr(&hotplug_pcp);
10498 + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
10499 + hp->unplug == current) {
10503 + if (hp->grab_lock) {
10504 + preempt_enable();
10505 + hotplug_lock(hp);
10506 + hotplug_unlock(hp);
10508 + preempt_enable();
10510 + * Try to push this task off of this CPU.
10512 + if (!migrate_me()) {
10513 + preempt_disable();
10514 + hp = this_cpu_ptr(&hotplug_pcp);
10515 + if (!hp->grab_lock) {
10517 + * Just let it continue; it's already pinned
10518 + * or about to sleep.
10523 + preempt_enable();
10526 + preempt_disable();
10531 + * unpin_current_cpu - Allow unplug of current cpu
10533 + * Must be called with preemption or interrupts disabled!
10535 +void unpin_current_cpu(void)
10537 + struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
10539 + WARN_ON(hp->refcount <= 0);
10541 + /* This is safe. sync_unplug_thread is pinned to this cpu */
10542 + if (!--hp->refcount && hp->unplug && hp->unplug != current)
10543 + wake_up_process(hp->unplug);
10546 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
10548 + set_current_state(TASK_UNINTERRUPTIBLE);
10549 + while (hp->refcount) {
10550 + schedule_preempt_disabled();
10551 + set_current_state(TASK_UNINTERRUPTIBLE);
10555 +static int sync_unplug_thread(void *data)
10557 + struct hotplug_pcp *hp = data;
10559 + wait_for_completion(&hp->unplug_wait);
10560 + preempt_disable();
10561 + hp->unplug = current;
10562 + wait_for_pinned_cpus(hp);
10565 + * This thread will synchronize the cpu_down() with threads
10566 + * that have pinned the CPU. When the pinned CPU count reaches
10567 + * zero, we inform the cpu_down code to continue to the next step.
10569 + set_current_state(TASK_UNINTERRUPTIBLE);
10570 + preempt_enable();
10571 + complete(&hp->synced);
10574 + * If all succeeds, the next step will need tasks to wait till
10575 + * the CPU is offline before continuing. To do this, the grab_lock
10576 + * is set and tasks going into pin_current_cpu() will block on the
10577 + * mutex. But we still need to wait for those that are already in
10578 + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
10579 + * will kick this thread out.
10581 + while (!hp->grab_lock && !kthread_should_stop()) {
10583 + set_current_state(TASK_UNINTERRUPTIBLE);
10586 + /* Make sure grab_lock is seen before we see a stale completion */
10590 + * Now just before cpu_down() enters stop machine, we need to make
10591 + * sure all tasks that are in pinned CPU sections are out, and new
10592 + * tasks will now grab the lock, keeping them from entering pinned
10595 + if (!kthread_should_stop()) {
10596 + preempt_disable();
10597 + wait_for_pinned_cpus(hp);
10598 + preempt_enable();
10599 + complete(&hp->synced);
10602 + set_current_state(TASK_UNINTERRUPTIBLE);
10603 + while (!kthread_should_stop()) {
10605 + set_current_state(TASK_UNINTERRUPTIBLE);
10607 + set_current_state(TASK_RUNNING);
10610 + * Force this thread off this CPU as it's going down and
10611 + * we don't want any more work on this CPU.
10613 + current->flags &= ~PF_NO_SETAFFINITY;
10614 + set_cpus_allowed_ptr(current, cpu_present_mask);
10619 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
10621 + wake_up_process(hp->sync_tsk);
10622 + wait_for_completion(&hp->synced);
10625 +static void __cpu_unplug_wait(unsigned int cpu)
10627 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10629 + complete(&hp->unplug_wait);
10630 + wait_for_completion(&hp->synced);
10634 + * Start the sync_unplug_thread on the target cpu and wait for it to
10637 +static int cpu_unplug_begin(unsigned int cpu)
10639 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10642 + /* Protected by cpu_hotplug.lock */
10643 + if (!hp->mutex_init) {
10644 +#ifdef CONFIG_PREEMPT_RT_FULL
10645 + spin_lock_init(&hp->lock);
10647 + mutex_init(&hp->mutex);
10649 + hp->mutex_init = 1;
10652 + /* Inform the scheduler to migrate tasks off this CPU */
10653 + tell_sched_cpu_down_begin(cpu);
10655 + init_completion(&hp->synced);
10656 + init_completion(&hp->unplug_wait);
10658 + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
10659 + if (IS_ERR(hp->sync_tsk)) {
10660 + err = PTR_ERR(hp->sync_tsk);
10661 + hp->sync_tsk = NULL;
10664 + kthread_bind(hp->sync_tsk, cpu);
10667 + * Wait for tasks to get out of the pinned sections,
10668 + * it's still OK if new tasks enter. Some CPU notifiers will
10669 + * wait for tasks that are going to enter these sections and
10670 + * we must not have them block.
10672 + wake_up_process(hp->sync_tsk);
10676 +static void cpu_unplug_sync(unsigned int cpu)
10678 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10680 + init_completion(&hp->synced);
10681 + /* The completion needs to be initialized before setting grab_lock */
10684 + /* Grab the mutex before setting grab_lock */
10685 + hotplug_lock(hp);
10686 + hp->grab_lock = 1;
10689 + * The CPU notifiers have been completed.
10690 + * Wait for tasks to get out of pinned CPU sections and have new
10691 + * tasks block until the CPU is completely down.
10693 + __cpu_unplug_sync(hp);
10695 + /* All done with the sync thread */
10696 + kthread_stop(hp->sync_tsk);
10697 + hp->sync_tsk = NULL;
10700 +static void cpu_unplug_done(unsigned int cpu)
10702 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
10704 + hp->unplug = NULL;
10705 + /* Let all tasks know cpu unplug is finished before cleaning up */
10708 + if (hp->sync_tsk)
10709 + kthread_stop(hp->sync_tsk);
10711 + if (hp->grab_lock) {
10712 + hotplug_unlock(hp);
10713 + /* protected by cpu_hotplug.lock */
10714 + hp->grab_lock = 0;
10716 + tell_sched_cpu_down_done(cpu);
10719 void get_online_cpus(void)
10721 @@ -789,10 +1072,14 @@
10722 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10725 + __cpu_unplug_wait(cpu);
10726 /* Park the smpboot threads */
10727 kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
10728 smpboot_park_threads(cpu);
10730 + /* Notifiers are done. Don't let any more tasks pin this CPU. */
10731 + cpu_unplug_sync(cpu);
10734 * Prevent irq alloc/free while the dying cpu reorganizes the
10735 * interrupt affinities.
10736 @@ -877,6 +1164,9 @@
10737 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
10738 int prev_state, ret = 0;
10739 bool hasdied = false;
10741 + cpumask_var_t cpumask;
10742 + cpumask_var_t cpumask_org;
10744 if (num_online_cpus() == 1)
10746 @@ -884,7 +1174,34 @@
10747 if (!cpu_present(cpu))
10750 + /* Move the downtaker off the unplug cpu */
10751 + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
10753 + if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
10754 + free_cpumask_var(cpumask);
10758 + cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
10759 + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
10760 + set_cpus_allowed_ptr(current, cpumask);
10761 + free_cpumask_var(cpumask);
10762 + migrate_disable();
10763 + mycpu = smp_processor_id();
10764 + if (mycpu == cpu) {
10765 + printk(KERN_ERR "Yuck! Still on unplug CPU\n!");
10766 + migrate_enable();
10768 + goto restore_cpus;
10771 + migrate_enable();
10772 cpu_hotplug_begin();
10773 + ret = cpu_unplug_begin(cpu);
10775 + printk("cpu_unplug_begin(%d) failed\n", cpu);
10779 cpuhp_tasks_frozen = tasks_frozen;
10781 @@ -923,10 +1240,15 @@
10783 hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
10785 + cpu_unplug_done(cpu);
10787 cpu_hotplug_done();
10788 /* This post dead nonsense must die */
10789 if (!ret && hasdied)
10790 cpu_notify_nofail(CPU_POST_DEAD, cpu);
10792 + set_cpus_allowed_ptr(current, cpumask_org);
10793 + free_cpumask_var(cpumask_org);
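For orientation, a hedged sketch of the intended caller of pin_current_cpu()/unpin_current_cpu(): on RT, migrate_disable()/migrate_enable() (changed elsewhere in this patch) bracket migration-disabled regions with these pins so a CPU cannot be unplugged from under a pinned task:

/* open-coded approximation of the migrate_disable()/migrate_enable()
 * pairing; illustrative only */
static void my_pinned_section(void)
{
	preempt_disable();	/* pin_current_cpu() expects preempt_count == 1 */
	pin_current_cpu();	/* may block while an unplug is in flight */
	preempt_enable();

	/* ... work that must not race with cpu_down() of this CPU ... */

	preempt_disable();
	unpin_current_cpu();	/* wakes the sync thread on the last unpin */
	preempt_enable();
}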
10797 diff -Nur linux-4.9.6.orig/kernel/debug/kdb/kdb_io.c linux-4.9.6/kernel/debug/kdb/kdb_io.c
10798 --- linux-4.9.6.orig/kernel/debug/kdb/kdb_io.c 2017-01-26 08:25:24.000000000 +0100
10799 +++ linux-4.9.6/kernel/debug/kdb/kdb_io.c 2017-01-28 13:59:09.951656955 +0100
10800 @@ -554,7 +554,6 @@
10803 int logging, saved_loglevel = 0;
10804 - int saved_trap_printk;
10805 int got_printf_lock = 0;
10808 @@ -565,8 +564,6 @@
10809 unsigned long uninitialized_var(flags);
10812 - saved_trap_printk = kdb_trap_printk;
10813 - kdb_trap_printk = 0;
10815 /* Serialize kdb_printf if multiple cpus try to write at once.
10816 * But if any cpu goes recursive in kdb, just print the output,
10817 @@ -855,7 +852,6 @@
10819 __release(kdb_printf_lock);
10821 - kdb_trap_printk = saved_trap_printk;
10825 @@ -865,9 +861,11 @@
10829 + kdb_trap_printk++;
10831 r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
10833 + kdb_trap_printk--;
10837 diff -Nur linux-4.9.6.orig/kernel/events/core.c linux-4.9.6/kernel/events/core.c
10838 --- linux-4.9.6.orig/kernel/events/core.c 2017-01-26 08:25:24.000000000 +0100
10839 +++ linux-4.9.6/kernel/events/core.c 2017-01-28 13:59:09.955657109 +0100
10840 @@ -1050,6 +1050,7 @@
10841 raw_spin_lock_init(&cpuctx->hrtimer_lock);
10842 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
10843 timer->function = perf_mux_hrtimer_handler;
10844 + timer->irqsafe = 1;
10847 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
10848 @@ -8335,6 +8336,7 @@
10850 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
10851 hwc->hrtimer.function = perf_swevent_hrtimer;
10852 + hwc->hrtimer.irqsafe = 1;
10855 * Since hrtimers have a fixed rate, we can do a static freq->period
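The irqsafe field is introduced by this patch's hrtimer changes, outside this excerpt: a timer so marked keeps expiring from hard interrupt context on RT instead of being deferred to the hrtimer softirq thread. A hedged sketch of marking one's own timer the same way:

#include <linux/hrtimer.h>

static struct hrtimer my_timer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	/* must be hard-irq safe on RT: no sleeping locks in here */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	my_timer.function = my_timer_fn;
	my_timer.irqsafe = 1;	/* same treatment as the perf timers above */
}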
10856 diff -Nur linux-4.9.6.orig/kernel/exit.c linux-4.9.6/kernel/exit.c
10857 --- linux-4.9.6.orig/kernel/exit.c 2017-01-26 08:25:24.000000000 +0100
10858 +++ linux-4.9.6/kernel/exit.c 2017-01-28 13:59:09.955657109 +0100
10859 @@ -143,7 +143,7 @@
10860 * Do this under ->siglock, we can race with another thread
10861 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
10863 - flush_sigqueue(&tsk->pending);
10864 + flush_task_sigqueue(tsk);
10865 tsk->sighand = NULL;
10866 spin_unlock(&sighand->siglock);
10868 diff -Nur linux-4.9.6.orig/kernel/fork.c linux-4.9.6/kernel/fork.c
10869 --- linux-4.9.6.orig/kernel/fork.c 2017-01-26 08:25:24.000000000 +0100
10870 +++ linux-4.9.6/kernel/fork.c 2017-01-28 13:59:09.955657109 +0100
10872 #include <linux/compiler.h>
10873 #include <linux/sysctl.h>
10874 #include <linux/kcov.h>
10875 +#include <linux/kprobes.h>
10877 #include <asm/pgtable.h>
10878 #include <asm/pgalloc.h>
10879 @@ -376,13 +377,24 @@
10880 if (atomic_dec_and_test(&sig->sigcnt))
10881 free_signal_struct(sig);
10884 +#ifdef CONFIG_PREEMPT_RT_BASE
10887 void __put_task_struct(struct task_struct *tsk)
10889 WARN_ON(!tsk->exit_state);
10890 WARN_ON(atomic_read(&tsk->usage));
10891 WARN_ON(tsk == current);
10894 + * Remove function-return probe instances associated with this
10895 + * task and put them back on the free list.
10897 + kprobe_flush_task(tsk);
10899 + /* Task is done with its stack. */
10900 + put_task_stack(tsk);
10903 task_numa_free(tsk);
10904 security_task_free(tsk);
10905 @@ -393,7 +405,18 @@
10906 if (!profile_handoff_task(tsk))
10909 +#ifndef CONFIG_PREEMPT_RT_BASE
10910 EXPORT_SYMBOL_GPL(__put_task_struct);
10912 +void __put_task_struct_cb(struct rcu_head *rhp)
10914 + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
10916 + __put_task_struct(tsk);
10919 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
10922 void __init __weak arch_task_cache_init(void) { }
10924 @@ -852,6 +875,19 @@
10926 EXPORT_SYMBOL_GPL(__mmdrop);
10928 +#ifdef CONFIG_PREEMPT_RT_BASE
10930 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
10931 + * want another facility to make this work.
10933 +void __mmdrop_delayed(struct rcu_head *rhp)
10935 + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
10941 static inline void __mmput(struct mm_struct *mm)
10943 VM_BUG_ON(atomic_read(&mm->mm_users));
10944 @@ -1426,6 +1462,9 @@
10946 static void posix_cpu_timers_init(struct task_struct *tsk)
10948 +#ifdef CONFIG_PREEMPT_RT_BASE
10949 + tsk->posix_timer_list = NULL;
10951 tsk->cputime_expires.prof_exp = 0;
10952 tsk->cputime_expires.virt_exp = 0;
10953 tsk->cputime_expires.sched_exp = 0;
10954 @@ -1552,6 +1591,7 @@
10955 spin_lock_init(&p->alloc_lock);
10957 init_sigpending(&p->pending);
10958 + p->sigqueue_cache = NULL;
10960 p->utime = p->stime = p->gtime = 0;
10961 p->utimescaled = p->stimescaled = 0;
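__put_task_struct_cb() pairs with an RT variant of put_task_struct() in include/linux/sched.h, which this excerpt does not show. Its presumed shape, so the RCU indirection above makes sense (put_rcu is the rcu_head this patch is assumed to add to task_struct):

#ifdef CONFIG_PREEMPT_RT_BASE
static inline void put_task_struct(struct task_struct *t)
{
	/* defer the free through RCU: the final put may happen in a
	 * context that cannot take the sleeping locks freeing needs */
	if (atomic_dec_and_test(&t->usage))
		call_rcu(&t->put_rcu, __put_task_struct_cb);
}
#endif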
10962 diff -Nur linux-4.9.6.orig/kernel/futex.c linux-4.9.6/kernel/futex.c
10963 --- linux-4.9.6.orig/kernel/futex.c 2017-01-26 08:25:24.000000000 +0100
10964 +++ linux-4.9.6/kernel/futex.c 2017-01-28 13:59:09.955657109 +0100
10965 @@ -904,7 +904,9 @@
10966 * task still owns the PI-state:
10968 if (head->next != next) {
10969 + raw_spin_unlock_irq(&curr->pi_lock);
10970 spin_unlock(&hb->lock);
10971 + raw_spin_lock_irq(&curr->pi_lock);
10975 @@ -1299,6 +1301,7 @@
10976 struct futex_pi_state *pi_state = this->pi_state;
10977 u32 uninitialized_var(curval), newval;
10979 + WAKE_Q(wake_sleeper_q);
10983 @@ -1365,7 +1368,8 @@
10985 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
10987 - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
10988 + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
10989 + &wake_sleeper_q);
10992 * First unlock HB so the waiter does not spin on it once he got woken
10993 @@ -1373,8 +1377,9 @@
10994 * deboost first (and lose our higher priority), then the task might get
10995 * scheduled away before the wake up can take place.
10997 - spin_unlock(&hb->lock);
10998 + deboost |= spin_unlock_no_deboost(&hb->lock);
10999 wake_up_q(&wake_q);
11000 + wake_up_q_sleeper(&wake_sleeper_q);
11002 rt_mutex_adjust_prio(current);
11004 @@ -1924,6 +1929,16 @@
11005 requeue_pi_wake_futex(this, &key2, hb2);
11008 + } else if (ret == -EAGAIN) {
11010 + * Waiter was woken by timeout or
11011 + * signal and has set pi_blocked_on to
11012 + * PI_WAKEUP_INPROGRESS before we
11013 + * tried to enqueue it on the rtmutex.
11015 + this->pi_state = NULL;
11016 + put_pi_state(pi_state);
11020 * rt_mutex_start_proxy_lock() detected a
11021 @@ -2814,7 +2829,7 @@
11022 struct hrtimer_sleeper timeout, *to = NULL;
11023 struct rt_mutex_waiter rt_waiter;
11024 struct rt_mutex *pi_mutex = NULL;
11025 - struct futex_hash_bucket *hb;
11026 + struct futex_hash_bucket *hb, *hb2;
11027 union futex_key key2 = FUTEX_KEY_INIT;
11028 struct futex_q q = futex_q_init;
11030 @@ -2839,10 +2854,7 @@
11031 * The waiter is allocated on our stack, manipulated by the requeue
11032 * code while we sleep on uaddr.
11034 - debug_rt_mutex_init_waiter(&rt_waiter);
11035 - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
11036 - RB_CLEAR_NODE(&rt_waiter.tree_entry);
11037 - rt_waiter.task = NULL;
11038 + rt_mutex_init_waiter(&rt_waiter, false);
11040 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
11041 if (unlikely(ret != 0))
11042 @@ -2873,20 +2885,55 @@
11043 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
11044 futex_wait_queue_me(hb, &q, to);
11046 - spin_lock(&hb->lock);
11047 - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11048 - spin_unlock(&hb->lock);
11050 - goto out_put_keys;
11052 + * On RT we must avoid races with requeue and trying to block
11053 + * on two mutexes (hb->lock and uaddr2's rtmutex) by
11054 + * serializing access to pi_blocked_on with pi_lock.
11056 + raw_spin_lock_irq(&current->pi_lock);
11057 + if (current->pi_blocked_on) {
11059 + * We have been requeued or are in the process of
11060 + * being requeued.
11062 + raw_spin_unlock_irq(&current->pi_lock);
11065 + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
11066 + * prevents a concurrent requeue from moving us to the
11067 + * uaddr2 rtmutex. After that we can safely acquire
11068 + * (and possibly block on) hb->lock.
11070 + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
11071 + raw_spin_unlock_irq(&current->pi_lock);
11073 + spin_lock(&hb->lock);
11076 + * Clean up pi_blocked_on. We might leak it otherwise
11077 + * when we succeeded with the hb->lock in the fast
11080 + raw_spin_lock_irq(&current->pi_lock);
11081 + current->pi_blocked_on = NULL;
11082 + raw_spin_unlock_irq(&current->pi_lock);
11084 + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11085 + spin_unlock(&hb->lock);
11087 + goto out_put_keys;
11091 - * In order for us to be here, we know our q.key == key2, and since
11092 - * we took the hb->lock above, we also know that futex_requeue() has
11093 - * completed and we no longer have to concern ourselves with a wakeup
11094 - * race with the atomic proxy lock acquisition by the requeue code. The
11095 - * futex_requeue dropped our key1 reference and incremented our key2
11096 - * reference count.
11097 + * In order to be here, we have either been requeued, are in
11098 + * the process of being requeued, or requeue successfully
11099 + * acquired uaddr2 on our behalf. If pi_blocked_on was
11100 + * non-null above, we may be racing with a requeue. Do not
11101 + * rely on q->lock_ptr to be hb2->lock until after blocking on
11102 + * hb->lock or hb2->lock. The futex_requeue dropped our key1
11103 + * reference and incremented our key2 reference count.
11105 + hb2 = hash_futex(&key2);
11107 /* Check if the requeue code acquired the second futex for us. */
11108 if (!q.rt_waiter) {
11109 @@ -2895,14 +2942,15 @@
11110 * did a lock-steal - fix up the PI-state in that case.
11112 if (q.pi_state && (q.pi_state->owner != current)) {
11113 - spin_lock(q.lock_ptr);
11114 + spin_lock(&hb2->lock);
11115 + BUG_ON(&hb2->lock != q.lock_ptr);
11116 ret = fixup_pi_state_owner(uaddr2, &q, current);
11118 * Drop the reference to the pi state which
11119 * the requeue_pi() code acquired for us.
11121 put_pi_state(q.pi_state);
11122 - spin_unlock(q.lock_ptr);
11123 + spin_unlock(&hb2->lock);
11127 @@ -2915,7 +2963,8 @@
11128 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
11129 debug_rt_mutex_free_waiter(&rt_waiter);
11131 - spin_lock(q.lock_ptr);
11132 + spin_lock(&hb2->lock);
11133 + BUG_ON(&hb2->lock != q.lock_ptr);
11135 * Fixup the pi_state owner and possibly acquire the lock if we
11137 diff -Nur linux-4.9.6.orig/kernel/irq/handle.c linux-4.9.6/kernel/irq/handle.c
11138 --- linux-4.9.6.orig/kernel/irq/handle.c 2017-01-26 08:25:24.000000000 +0100
11139 +++ linux-4.9.6/kernel/irq/handle.c 2017-01-28 13:59:09.955657109 +0100
11140 @@ -181,10 +181,16 @@
11142 irqreturn_t retval;
11143 unsigned int flags = 0;
11144 + struct pt_regs *regs = get_irq_regs();
11145 + u64 ip = regs ? instruction_pointer(regs) : 0;
11147 retval = __handle_irq_event_percpu(desc, &flags);
11149 - add_interrupt_randomness(desc->irq_data.irq, flags);
11150 +#ifdef CONFIG_PREEMPT_RT_FULL
11151 + desc->random_ip = ip;
11153 + add_interrupt_randomness(desc->irq_data.irq, flags, ip);
11157 note_interrupt(desc, retval);
11158 diff -Nur linux-4.9.6.orig/kernel/irq/manage.c linux-4.9.6/kernel/irq/manage.c
11159 --- linux-4.9.6.orig/kernel/irq/manage.c 2017-01-26 08:25:24.000000000 +0100
11160 +++ linux-4.9.6/kernel/irq/manage.c 2017-01-28 13:59:09.955657109 +0100
11162 #include "internals.h"
11164 #ifdef CONFIG_IRQ_FORCED_THREADING
11165 +# ifndef CONFIG_PREEMPT_RT_BASE
11166 __read_mostly bool force_irqthreads;
11168 static int __init setup_forced_irqthreads(char *arg)
11172 early_param("threadirqs", setup_forced_irqthreads);
11176 static void __synchronize_hardirq(struct irq_desc *desc)
11177 @@ -233,7 +235,12 @@
11179 if (desc->affinity_notify) {
11180 kref_get(&desc->affinity_notify->kref);
11182 +#ifdef CONFIG_PREEMPT_RT_BASE
11183 + swork_queue(&desc->affinity_notify->swork);
11185 schedule_work(&desc->affinity_notify->work);
11188 irqd_set(data, IRQD_AFFINITY_SET);
11190 @@ -271,10 +278,8 @@
11192 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
11194 -static void irq_affinity_notify(struct work_struct *work)
11195 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
11197 - struct irq_affinity_notify *notify =
11198 - container_of(work, struct irq_affinity_notify, work);
11199 struct irq_desc *desc = irq_to_desc(notify->irq);
11200 cpumask_var_t cpumask;
11201 unsigned long flags;
11202 @@ -296,6 +301,35 @@
11203 kref_put(&notify->kref, notify->release);
11206 +#ifdef CONFIG_PREEMPT_RT_BASE
11207 +static void init_helper_thread(void)
11209 + static int init_sworker_once;
11211 + if (init_sworker_once)
11213 + if (WARN_ON(swork_get()))
11215 + init_sworker_once = 1;
11218 +static void irq_affinity_notify(struct swork_event *swork)
11220 + struct irq_affinity_notify *notify =
11221 + container_of(swork, struct irq_affinity_notify, swork);
11222 + _irq_affinity_notify(notify);
11227 +static void irq_affinity_notify(struct work_struct *work)
11229 + struct irq_affinity_notify *notify =
11230 + container_of(work, struct irq_affinity_notify, work);
11231 + _irq_affinity_notify(notify);
11236 * irq_set_affinity_notifier - control notification of IRQ affinity changes
11237 * @irq: Interrupt for which to enable/disable notification
11238 @@ -324,7 +358,12 @@
11241 kref_init(&notify->kref);
11242 +#ifdef CONFIG_PREEMPT_RT_BASE
11243 + INIT_SWORK(&notify->swork, irq_affinity_notify);
11244 + init_helper_thread();
11246 INIT_WORK(&notify->work, irq_affinity_notify);
11250 raw_spin_lock_irqsave(&desc->lock, flags);
11251 @@ -879,7 +918,15 @@
11252 local_bh_disable();
11253 ret = action->thread_fn(action->irq, action->dev_id);
11254 irq_finalize_oneshot(desc, action);
11255 - local_bh_enable();
11257 + * Interrupts which have real time requirements can be set up
11258 + * to avoid softirq processing in the thread handler. This is
11259 + * safe as these interrupts do not raise soft interrupts.
11261 + if (irq_settings_no_softirq_call(desc))
11262 + _local_bh_enable();
11264 + local_bh_enable();
11268 @@ -976,6 +1023,12 @@
11269 if (action_ret == IRQ_WAKE_THREAD)
11270 irq_wake_secondary(desc, action);
11272 +#ifdef CONFIG_PREEMPT_RT_FULL
11273 + migrate_disable();
11274 + add_interrupt_randomness(action->irq, 0,
11275 + desc->random_ip ^ (unsigned long) action);
11276 + migrate_enable();
11278 wake_threads_waitq(desc);
11281 @@ -1336,6 +1389,9 @@
11282 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
11285 + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
11286 + irq_settings_set_no_softirq_call(desc);
11288 /* Set default affinity mask once everything is setup */
11289 setup_affinity(desc, mask);
11291 @@ -2061,7 +2117,7 @@
11292 * This call sets the internal irqchip state of an interrupt,
11293 * depending on the value of @which.
11295 - * This function should be called with preemption disabled if the
11296 + * This function should be called with migration disabled if the
11297 * interrupt controller has per-cpu registers.
11299 int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
11300 diff -Nur linux-4.9.6.orig/kernel/irq/settings.h linux-4.9.6/kernel/irq/settings.h
11301 --- linux-4.9.6.orig/kernel/irq/settings.h 2017-01-26 08:25:24.000000000 +0100
11302 +++ linux-4.9.6/kernel/irq/settings.h 2017-01-28 13:59:09.955657109 +0100
11304 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
11305 _IRQ_IS_POLLED = IRQ_IS_POLLED,
11306 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
11307 + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
11308 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
11312 #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
11313 #define IRQ_IS_POLLED GOT_YOU_MORON
11314 #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
11315 +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
11316 #undef IRQF_MODIFY_MASK
11317 #define IRQF_MODIFY_MASK GOT_YOU_MORON
11320 desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
11323 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
11325 + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
11328 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
11330 + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
11333 static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
11335 return desc->status_use_accessors & _IRQ_PER_CPU;
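The accessors above are driven by IRQF_NO_SOFTIRQ_CALL, which this patch adds to <linux/interrupt.h> (not in this excerpt). A hedged sketch of a driver opting its threaded handler out of softirq processing; my_thread_fn is illustrative and must never raise soft interrupts:

#include <linux/interrupt.h>

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	/* latency-critical work; must not raise softirqs */
	return IRQ_HANDLED;
}

static int my_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, NULL, my_thread_fn,
				    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
				    "my-rt-dev", dev);
}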
11336 diff -Nur linux-4.9.6.orig/kernel/irq/spurious.c linux-4.9.6/kernel/irq/spurious.c
11337 --- linux-4.9.6.orig/kernel/irq/spurious.c 2017-01-26 08:25:24.000000000 +0100
11338 +++ linux-4.9.6/kernel/irq/spurious.c 2017-01-28 13:59:09.955657109 +0100
11339 @@ -442,6 +442,10 @@
11341 static int __init irqfixup_setup(char *str)
11343 +#ifdef CONFIG_PREEMPT_RT_BASE
11344 + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11348 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
11349 printk(KERN_WARNING "This may impact system performance.\n");
11350 @@ -454,6 +458,10 @@
11352 static int __init irqpoll_setup(char *str)
11354 +#ifdef CONFIG_PREEMPT_RT_BASE
11355 + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
11359 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
11361 diff -Nur linux-4.9.6.orig/kernel/irq_work.c linux-4.9.6/kernel/irq_work.c
11362 --- linux-4.9.6.orig/kernel/irq_work.c 2017-01-26 08:25:24.000000000 +0100
11363 +++ linux-4.9.6/kernel/irq_work.c 2017-01-28 13:59:09.955657109 +0100
11365 #include <linux/cpu.h>
11366 #include <linux/notifier.h>
11367 #include <linux/smp.h>
11368 +#include <linux/interrupt.h>
11369 #include <asm/processor.h>
11374 bool irq_work_queue_on(struct irq_work *work, int cpu)
11376 + struct llist_head *list;
11378 /* All work should have been flushed before going offline */
11379 WARN_ON_ONCE(cpu_is_offline(cpu));
11382 if (!irq_work_claim(work))
11385 - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
11386 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
11387 + list = &per_cpu(lazy_list, cpu);
11389 + list = &per_cpu(raised_list, cpu);
11391 + if (llist_add(&work->llnode, list))
11392 arch_send_call_function_single_ipi(cpu);
11396 /* Enqueue the irq work @work on the current CPU */
11397 bool irq_work_queue(struct irq_work *work)
11399 + struct llist_head *list;
11400 + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
11402 /* Only queue if not already pending */
11403 if (!irq_work_claim(work))
11405 @@ -93,13 +104,15 @@
11406 /* Queue the entry and raise the IPI if needed. */
11409 - /* If the work is "lazy", handle it from next tick if any */
11410 - if (work->flags & IRQ_WORK_LAZY) {
11411 - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
11412 - tick_nohz_tick_stopped())
11413 - arch_irq_work_raise();
11415 - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
11416 + lazy_work = work->flags & IRQ_WORK_LAZY;
11418 + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
11419 + list = this_cpu_ptr(&lazy_list);
11421 + list = this_cpu_ptr(&raised_list);
11423 + if (llist_add(&work->llnode, list)) {
11424 + if (!lazy_work || tick_nohz_tick_stopped())
11425 arch_irq_work_raise();
11428 @@ -116,9 +129,8 @@
11429 raised = this_cpu_ptr(&raised_list);
11430 lazy = this_cpu_ptr(&lazy_list);
11432 - if (llist_empty(raised) || arch_irq_work_has_interrupt())
11433 - if (llist_empty(lazy))
11435 + if (llist_empty(raised) && llist_empty(lazy))
11436 + return false;
11438 /* All work should have been flushed before going offline */
11439 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
11440 @@ -132,7 +144,7 @@
11441 struct irq_work *work;
11442 struct llist_node *llnode;
11444 - BUG_ON(!irqs_disabled());
11445 + BUG_ON_NONRT(!irqs_disabled());
11447 if (llist_empty(list))
11449 @@ -169,7 +181,16 @@
11450 void irq_work_run(void)
11452 irq_work_run_list(this_cpu_ptr(&raised_list));
11453 - irq_work_run_list(this_cpu_ptr(&lazy_list));
11454 + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
11456 + * NOTE: we raise softirq via IPI for safety,
11457 + * and execute in irq_work_tick() to move the
11458 + * overhead from hard to soft irq context.
11460 + if (!llist_empty(this_cpu_ptr(&lazy_list)))
11461 + raise_softirq(TIMER_SOFTIRQ);
11463 + irq_work_run_list(this_cpu_ptr(&lazy_list));
11465 EXPORT_SYMBOL_GPL(irq_work_run);
11467 @@ -179,8 +200,17 @@
11469 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
11470 irq_work_run_list(raised);
11472 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
11473 + irq_work_run_list(this_cpu_ptr(&lazy_list));
11476 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
11477 +void irq_work_tick_soft(void)
11479 irq_work_run_list(this_cpu_ptr(&lazy_list));
11484 * Synchronize against the irq_work @entry, ensures the entry is not
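On PREEMPT_RT_FULL, queued irq_work items are diverted to the per-CPU lazy_list and executed from the timer softirq; only items flagged IRQ_WORK_HARD_IRQ stay on raised_list and run from the hard interrupt. A minimal sketch (callback and setup names are hypothetical; IRQ_WORK_HARD_IRQ is the flag added by this series) of queueing work that must keep hard-irq semantics on RT:

#include <linux/irq_work.h>

static void my_cb(struct irq_work *work)
{
        /* runs in hard-irq context even with PREEMPT_RT_FULL */
}

static struct irq_work my_work;

static void my_setup(void)
{
        init_irq_work(&my_work, my_cb);
        my_work.flags = IRQ_WORK_HARD_IRQ;      /* stay on raised_list on RT */
}

static void my_kick(void)
{
        irq_work_queue(&my_work);
}

Work queued without the flag still runs, just later and in softirq context, which is what moves the overhead out of the latency-critical hard-irq path.
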
11485 diff -Nur linux-4.9.6.orig/kernel/Kconfig.locks linux-4.9.6/kernel/Kconfig.locks
11486 --- linux-4.9.6.orig/kernel/Kconfig.locks 2017-01-26 08:25:24.000000000 +0100
11487 +++ linux-4.9.6/kernel/Kconfig.locks 2017-01-28 13:59:09.951656955 +0100
11488 @@ -225,11 +225,11 @@
11490 config MUTEX_SPIN_ON_OWNER
11492 - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
11493 + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11495 config RWSEM_SPIN_ON_OWNER
11497 - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
11498 + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11500 config LOCK_SPIN_ON_OWNER
11502 diff -Nur linux-4.9.6.orig/kernel/Kconfig.preempt linux-4.9.6/kernel/Kconfig.preempt
11503 --- linux-4.9.6.orig/kernel/Kconfig.preempt 2017-01-26 08:25:24.000000000 +0100
11504 +++ linux-4.9.6/kernel/Kconfig.preempt 2017-01-28 13:59:09.951656955 +0100
11508 + select PREEMPT_COUNT
11510 +config PREEMPT_RT_BASE
11514 +config HAVE_PREEMPT_LAZY
11517 +config PREEMPT_LAZY
11518 + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
11521 prompt "Preemption Model"
11524 Select this if you are building a kernel for a desktop system.
11527 +config PREEMPT__LL
11528 bool "Preemptible Kernel (Low-Latency Desktop)"
11529 - select PREEMPT_COUNT
11531 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
11533 This option reduces the latency of the kernel by making
11535 embedded system with latency requirements in the milliseconds
11538 +config PREEMPT_RTB
11539 + bool "Preemptible Kernel (Basic RT)"
11540 + select PREEMPT_RT_BASE
11542 + This option is basically the same as (Low-Latency Desktop) but
11543 + enables changes which are preliminary for the full preemptible
11544 + kernel.
11546 +config PREEMPT_RT_FULL
11547 + bool "Fully Preemptible Kernel (RT)"
11548 + depends on IRQ_FORCED_THREADING
11549 + select PREEMPT_RT_BASE
11550 + select PREEMPT_RCU
11552 + All and everything: the fully preemptible kernel model. Spinlocks
11553 + and rwlocks become sleeping locks and almost all kernel code runs
11554 + preemptibly.
11556 config PREEMPT_COUNT
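PREEMPT_RT_BASE is selected by both PREEMPT_RTB and PREEMPT_RT_FULL, so code shared by the two models can key off it at compile time; a trivial, purely illustrative check:

#include <linux/kconfig.h>

static inline bool rt_base_enabled(void)
{
        /* true for both PREEMPT_RTB and PREEMPT_RT_FULL kernels */
        return IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
}
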
11557 diff -Nur linux-4.9.6.orig/kernel/ksysfs.c linux-4.9.6/kernel/ksysfs.c
11558 --- linux-4.9.6.orig/kernel/ksysfs.c 2017-01-26 08:25:24.000000000 +0100
11559 +++ linux-4.9.6/kernel/ksysfs.c 2017-01-28 13:59:09.955657109 +0100
11560 @@ -136,6 +136,15 @@
11562 #endif /* CONFIG_KEXEC_CORE */
11564 +#if defined(CONFIG_PREEMPT_RT_FULL)
11565 +static ssize_t realtime_show(struct kobject *kobj,
11566 + struct kobj_attribute *attr, char *buf)
11568 + return sprintf(buf, "%d\n", 1);
11570 +KERNEL_ATTR_RO(realtime);
11573 /* whether file capabilities are enabled */
11574 static ssize_t fscaps_show(struct kobject *kobj,
11575 struct kobj_attribute *attr, char *buf)
11576 @@ -225,6 +234,9 @@
11577 &rcu_expedited_attr.attr,
11578 &rcu_normal_attr.attr,
11580 +#ifdef CONFIG_PREEMPT_RT_FULL
11581 + &realtime_attr.attr,
11586 diff -Nur linux-4.9.6.orig/kernel/locking/lockdep.c linux-4.9.6/kernel/locking/lockdep.c
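The new attribute gives userspace a cheap way to detect an RT kernel: /sys/kernel/realtime exists only on PREEMPT_RT_FULL and always reads "1". An illustrative userspace probe:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        /* the file is absent on non-RT kernels, so fopen() failing means "no" */
        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT: %s\n", rt ? "yes" : "no");
        return 0;
}
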
11587 --- linux-4.9.6.orig/kernel/locking/lockdep.c 2017-01-26 08:25:24.000000000 +0100
11588 +++ linux-4.9.6/kernel/locking/lockdep.c 2017-01-28 13:59:09.955657109 +0100
11589 @@ -3689,6 +3689,7 @@
11593 +#ifndef CONFIG_PREEMPT_RT_FULL
11595 * We dont accurately track softirq state in e.g.
11596 * hardirq contexts (such as on 4KSTACKS), so only
11597 @@ -3703,6 +3704,7 @@
11598 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
11604 print_irqtrace_events(current);
11605 diff -Nur linux-4.9.6.orig/kernel/locking/locktorture.c linux-4.9.6/kernel/locking/locktorture.c
11606 --- linux-4.9.6.orig/kernel/locking/locktorture.c 2017-01-26 08:25:24.000000000 +0100
11607 +++ linux-4.9.6/kernel/locking/locktorture.c 2017-01-28 13:59:09.959657263 +0100
11609 #include <linux/kthread.h>
11610 #include <linux/sched/rt.h>
11611 #include <linux/spinlock.h>
11612 -#include <linux/rwlock.h>
11613 #include <linux/mutex.h>
11614 #include <linux/rwsem.h>
11615 #include <linux/smp.h>
11616 diff -Nur linux-4.9.6.orig/kernel/locking/Makefile linux-4.9.6/kernel/locking/Makefile
11617 --- linux-4.9.6.orig/kernel/locking/Makefile 2017-01-26 08:25:24.000000000 +0100
11618 +++ linux-4.9.6/kernel/locking/Makefile 2017-01-28 13:59:09.955657109 +0100
11620 # and is generally not a function of system call inputs.
11621 KCOV_INSTRUMENT := n
11623 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
11624 +obj-y += semaphore.o percpu-rwsem.o
11626 ifdef CONFIG_FUNCTION_TRACER
11627 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
11629 CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
11632 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11634 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
11637 obj-$(CONFIG_LOCKDEP) += lockdep.o
11638 ifeq ($(CONFIG_PROC_FS),y)
11639 obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
11641 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
11642 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
11643 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
11644 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
11645 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
11646 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
11648 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
11649 obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
11650 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
11651 diff -Nur linux-4.9.6.orig/kernel/locking/percpu-rwsem.c linux-4.9.6/kernel/locking/percpu-rwsem.c
11652 --- linux-4.9.6.orig/kernel/locking/percpu-rwsem.c 2017-01-26 08:25:24.000000000 +0100
11653 +++ linux-4.9.6/kernel/locking/percpu-rwsem.c 2017-01-28 13:59:09.959657263 +0100
11655 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
11656 rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
11657 __init_rwsem(&sem->rw_sem, name, rwsem_key);
11658 - init_waitqueue_head(&sem->writer);
11659 + init_swait_queue_head(&sem->writer);
11660 sem->readers_block = 0;
11663 @@ -103,7 +103,7 @@
11664 __this_cpu_dec(*sem->read_count);
11666 /* Prod writer to recheck readers_active */
11667 - wake_up(&sem->writer);
11668 + swake_up(&sem->writer);
11670 EXPORT_SYMBOL_GPL(__percpu_up_read);
11672 @@ -160,7 +160,7 @@
11675 /* Wait for all now active readers to complete. */
11676 - wait_event(sem->writer, readers_active_check(sem));
11677 + swait_event(sem->writer, readers_active_check(sem));
11679 EXPORT_SYMBOL_GPL(percpu_down_write);
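The writer-side wait in percpu-rwsem moves from the classic waitqueue to the simple-wait (swait) API, whose wake side does bounded work under a raw lock and is therefore usable from the contexts RT cares about. The conversion pattern, sketched on a hypothetical structure:

#include <linux/swait.h>

struct foo {
        struct swait_queue_head wq;
        bool done;
};

static void foo_init(struct foo *f)
{
        init_swait_queue_head(&f->wq);  /* was: init_waitqueue_head() */
        f->done = false;
}

static void foo_wait(struct foo *f)
{
        swait_event(f->wq, f->done);    /* was: wait_event() */
}

static void foo_complete(struct foo *f)
{
        f->done = true;
        swake_up(&f->wq);               /* was: wake_up() */
}

Note that swait is deliberately minimal: no exclusive/non-exclusive mix and no custom wake callbacks, so it is not a drop-in replacement everywhere the classic waitqueue is used.
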
11681 diff -Nur linux-4.9.6.orig/kernel/locking/rt.c linux-4.9.6/kernel/locking/rt.c
11682 --- linux-4.9.6.orig/kernel/locking/rt.c 1970-01-01 01:00:00.000000000 +0100
11683 +++ linux-4.9.6/kernel/locking/rt.c 2017-01-28 13:59:10.035660185 +0100
11688 + * Real-Time Preemption Support
11690 + * started by Ingo Molnar:
11692 + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
11693 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
11695 + * historic credit for proving that Linux spinlocks can be implemented via
11696 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
11697 + * and others) who prototyped it on 2.4 and did lots of comparative
11698 + * research and analysis; TimeSys, for proving that you can implement a
11699 + * fully preemptible kernel via the use of IRQ threading and mutexes;
11700 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
11701 + * right one; and to MontaVista, who ported pmutexes to 2.6.
11703 + * This code is a from-scratch implementation and is not based on pmutexes,
11704 + * but the idea of converting spinlocks to mutexes is used here too.
11706 + * lock debugging, locking tree, deadlock detection:
11708 + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
11709 + * Released under the General Public License (GPL).
11711 + * Includes portions of the generic R/W semaphore implementation from:
11713 + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
11714 + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
11715 + * - Derived also from comments by Linus
11717 + * Pending ownership of locks and ownership stealing:
11719 + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
11721 + * (also by Steven Rostedt)
11722 + * - Converted single pi_lock to individual task locks.
11724 + * By Esben Nielsen:
11725 + * Doing priority inheritance with help of the scheduler.
11727 + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
11728 + * - major rework based on Esben Nielsens initial patch
11729 + * - replaced thread_info references by task_struct refs
11730 + * - removed task->pending_owner dependency
11731 + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
11732 + * in the scheduler return path as discussed with Steven Rostedt
11734 + * Copyright (C) 2006, Kihon Technologies Inc.
11735 + * Steven Rostedt <rostedt@goodmis.org>
11736 + * - debugged and patched Thomas Gleixner's rework.
11737 + * - added back the cmpxchg to the rework.
11738 + * - turned atomic require back on for SMP.
11741 +#include <linux/spinlock.h>
11742 +#include <linux/rtmutex.h>
11743 +#include <linux/sched.h>
11744 +#include <linux/delay.h>
11745 +#include <linux/module.h>
11746 +#include <linux/kallsyms.h>
11747 +#include <linux/syscalls.h>
11748 +#include <linux/interrupt.h>
11749 +#include <linux/plist.h>
11750 +#include <linux/fs.h>
11751 +#include <linux/futex.h>
11752 +#include <linux/hrtimer.h>
11754 +#include "rtmutex_common.h"
11757 + * struct mutex functions
11759 +void __mutex_do_init(struct mutex *mutex, const char *name,
11760 + struct lock_class_key *key)
11762 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11764 + * Make sure we are not reinitializing a held lock:
11766 + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
11767 + lockdep_init_map(&mutex->dep_map, name, key, 0);
11769 + mutex->lock.save_state = 0;
11771 +EXPORT_SYMBOL(__mutex_do_init);
11773 +void __lockfunc _mutex_lock(struct mutex *lock)
11775 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11776 + rt_mutex_lock(&lock->lock);
11778 +EXPORT_SYMBOL(_mutex_lock);
11780 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
11784 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11785 + ret = rt_mutex_lock_interruptible(&lock->lock);
11787 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11790 +EXPORT_SYMBOL(_mutex_lock_interruptible);
11792 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
11796 + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11797 + ret = rt_mutex_lock_killable(&lock->lock);
11799 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11802 +EXPORT_SYMBOL(_mutex_lock_killable);
11804 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11805 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
11807 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
11808 + rt_mutex_lock(&lock->lock);
11810 +EXPORT_SYMBOL(_mutex_lock_nested);
11812 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
11814 + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
11815 + rt_mutex_lock(&lock->lock);
11817 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
11819 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
11823 + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
11824 + ret = rt_mutex_lock_interruptible(&lock->lock);
11826 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11829 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
11831 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
11835 + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
11836 + ret = rt_mutex_lock_killable(&lock->lock);
11838 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11841 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
11844 +int __lockfunc _mutex_trylock(struct mutex *lock)
11846 + int ret = rt_mutex_trylock(&lock->lock);
11849 + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
11853 +EXPORT_SYMBOL(_mutex_trylock);
11855 +void __lockfunc _mutex_unlock(struct mutex *lock)
11857 + mutex_release(&lock->dep_map, 1, _RET_IP_);
11858 + rt_mutex_unlock(&lock->lock);
11860 +EXPORT_SYMBOL(_mutex_unlock);
11863 + * rwlock_t functions
11865 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
11869 + migrate_disable();
11870 + ret = rt_mutex_trylock(&rwlock->lock);
11872 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
11874 + migrate_enable();
11878 +EXPORT_SYMBOL(rt_write_trylock);
11880 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
11885 + ret = rt_write_trylock(rwlock);
11888 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
11890 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
11892 + struct rt_mutex *lock = &rwlock->lock;
11896 + * recursive read locks succeed when current owns the lock,
11897 + * but not when read_depth == 0, which means that the lock is
11898 + * write locked.
11899 + */
11900 + if (rt_mutex_owner(lock) != current) {
11901 + migrate_disable();
11902 + ret = rt_mutex_trylock(lock);
11904 + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
11906 + migrate_enable();
11908 + } else if (!rwlock->read_depth) {
11913 + rwlock->read_depth++;
11917 +EXPORT_SYMBOL(rt_read_trylock);
11919 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
11921 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
11922 + __rt_spin_lock(&rwlock->lock);
11924 +EXPORT_SYMBOL(rt_write_lock);
11926 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
11928 + struct rt_mutex *lock = &rwlock->lock;
11932 + * recursive read locks succeed when current owns the lock
11934 + if (rt_mutex_owner(lock) != current) {
11935 + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
11936 + __rt_spin_lock(lock);
11938 + rwlock->read_depth++;
11941 +EXPORT_SYMBOL(rt_read_lock);
11943 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
11945 + /* NOTE: we always pass in '1' for nested, for simplicity */
11946 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
11947 + __rt_spin_unlock(&rwlock->lock);
11948 + migrate_enable();
11950 +EXPORT_SYMBOL(rt_write_unlock);
11952 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
11954 + /* Release the lock only when read_depth is down to 0 */
11955 + if (--rwlock->read_depth == 0) {
11956 + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
11957 + __rt_spin_unlock(&rwlock->lock);
11958 + migrate_enable();
11961 +EXPORT_SYMBOL(rt_read_unlock);
11963 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
11965 + rt_write_lock(rwlock);
11969 +EXPORT_SYMBOL(rt_write_lock_irqsave);
11971 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
11973 + rt_read_lock(rwlock);
11977 +EXPORT_SYMBOL(rt_read_lock_irqsave);
11979 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
11981 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11983 + * Make sure we are not reinitializing a held lock:
11985 + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
11986 + lockdep_init_map(&rwlock->dep_map, name, key, 0);
11988 + rwlock->lock.save_state = 1;
11989 + rwlock->read_depth = 0;
11991 +EXPORT_SYMBOL(__rt_rwlock_init);
11997 +void rt_up_write(struct rw_semaphore *rwsem)
11999 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12000 + rt_mutex_unlock(&rwsem->lock);
12002 +EXPORT_SYMBOL(rt_up_write);
12004 +void __rt_up_read(struct rw_semaphore *rwsem)
12006 + if (--rwsem->read_depth == 0)
12007 + rt_mutex_unlock(&rwsem->lock);
12010 +void rt_up_read(struct rw_semaphore *rwsem)
12012 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12013 + __rt_up_read(rwsem);
12015 +EXPORT_SYMBOL(rt_up_read);
12018 + * downgrade a write lock into a read lock
12019 + * - just wake up any readers at the front of the queue
12021 +void rt_downgrade_write(struct rw_semaphore *rwsem)
12023 + BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
12024 + rwsem->read_depth = 1;
12026 +EXPORT_SYMBOL(rt_downgrade_write);
12028 +int rt_down_write_trylock(struct rw_semaphore *rwsem)
12030 + int ret = rt_mutex_trylock(&rwsem->lock);
12033 + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12036 +EXPORT_SYMBOL(rt_down_write_trylock);
12038 +void rt_down_write(struct rw_semaphore *rwsem)
12040 + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12041 + rt_mutex_lock(&rwsem->lock);
12043 +EXPORT_SYMBOL(rt_down_write);
12045 +int rt_down_write_killable(struct rw_semaphore *rwsem)
12049 + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12050 + ret = rt_mutex_lock_killable(&rwsem->lock);
12052 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12055 +EXPORT_SYMBOL(rt_down_write_killable);
12057 +int rt_down_write_killable_nested(struct rw_semaphore *rwsem, int subclass)
12061 + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12062 + ret = rt_mutex_lock_killable(&rwsem->lock);
12064 + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12067 +EXPORT_SYMBOL(rt_down_write_killable_nested);
12069 +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
12071 + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12072 + rt_mutex_lock(&rwsem->lock);
12074 +EXPORT_SYMBOL(rt_down_write_nested);
12076 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
12077 + struct lockdep_map *nest)
12079 + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
12080 + rt_mutex_lock(&rwsem->lock);
12082 +EXPORT_SYMBOL(rt_down_write_nested_lock);
12084 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
12086 + struct rt_mutex *lock = &rwsem->lock;
12090 + * recursive read locks succeed when current owns the rwsem,
12091 + * but not when read_depth == 0, which means that the rwsem is
12092 + * write locked.
12093 + */
12094 + if (rt_mutex_owner(lock) != current)
12095 + ret = rt_mutex_trylock(&rwsem->lock);
12096 + else if (!rwsem->read_depth)
12100 + rwsem->read_depth++;
12105 +int rt_down_read_trylock(struct rw_semaphore *rwsem)
12109 + ret = rt__down_read_trylock(rwsem);
12111 + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12115 +EXPORT_SYMBOL(rt_down_read_trylock);
12117 +void rt__down_read(struct rw_semaphore *rwsem)
12119 + struct rt_mutex *lock = &rwsem->lock;
12121 + if (rt_mutex_owner(lock) != current)
12122 + rt_mutex_lock(&rwsem->lock);
12123 + rwsem->read_depth++;
12125 +EXPORT_SYMBOL(rt__down_read);
12127 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
12129 + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
12130 + rt__down_read(rwsem);
12133 +void rt_down_read(struct rw_semaphore *rwsem)
12135 + __rt_down_read(rwsem, 0);
12137 +EXPORT_SYMBOL(rt_down_read);
12139 +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
12141 + __rt_down_read(rwsem, subclass);
12143 +EXPORT_SYMBOL(rt_down_read_nested);
12145 +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
12146 + struct lock_class_key *key)
12148 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12150 + * Make sure we are not reinitializing a held lock:
12152 + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
12153 + lockdep_init_map(&rwsem->dep_map, name, key, 0);
12155 + rwsem->read_depth = 0;
12156 + rwsem->lock.save_state = 0;
12158 +EXPORT_SYMBOL(__rt_rwsem_init);
12161 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
12162 + * @cnt: the atomic which we are to dec
12163 + * @lock: the mutex to return holding if we dec to 0
12165 + * return true and hold lock if we dec to 0, return false otherwise
12167 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
12169 + /* dec if we can't possibly hit 0 */
12170 + if (atomic_add_unless(cnt, -1, 1))
12171 + return 0;
12172 + /* we might hit 0, so take the lock */
12173 + mutex_lock(lock);
12174 + if (!atomic_dec_and_test(cnt)) {
12175 + /* when we actually did the dec, we didn't hit 0 */
12176 + mutex_unlock(lock);
12177 + return 0;
12178 + }
12179 + /* we hit 0, and we hold the lock */
12180 + return 1;
12181 +}
12182 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
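atomic_dec_and_mutex_lock() is the usual building block for "free on last put" teardown: the mutex is taken only on the final decrement. A usage sketch (my_obj, my_obj_destroy and obj_lock are hypothetical names):

#include <linux/atomic.h>
#include <linux/mutex.h>

struct my_obj {
        atomic_t refcount;
};

static DEFINE_MUTEX(obj_lock);

static void my_obj_destroy(struct my_obj *obj);         /* illustrative */

static void my_obj_put(struct my_obj *obj)
{
        /* returns with obj_lock held only when the count reached zero */
        if (!atomic_dec_and_mutex_lock(&obj->refcount, &obj_lock))
                return;

        my_obj_destroy(obj);    /* last reference, torn down under obj_lock */
        mutex_unlock(&obj_lock);
}
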
12183 diff -Nur linux-4.9.6.orig/kernel/locking/rtmutex.c linux-4.9.6/kernel/locking/rtmutex.c
12184 --- linux-4.9.6.orig/kernel/locking/rtmutex.c 2017-01-26 08:25:24.000000000 +0100
12185 +++ linux-4.9.6/kernel/locking/rtmutex.c 2017-01-28 13:59:10.035660185 +0100
12187 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12188 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
12189 * Copyright (C) 2006 Esben Nielsen
12190 + * Adaptive Spinlocks:
12191 + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
12192 + * and Peter Morreale,
12193 + * Adaptive Spinlocks simplification:
12194 + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
12196 * See Documentation/locking/rt-mutex-design.txt for details.
12199 #include <linux/sched/rt.h>
12200 #include <linux/sched/deadline.h>
12201 #include <linux/timer.h>
12202 +#include <linux/ww_mutex.h>
12204 #include "rtmutex_common.h"
12206 @@ -133,6 +139,12 @@
12207 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
12210 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
12212 + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
12213 + waiter != PI_REQUEUE_INPROGRESS;
12217 * We can speed up the acquire/release, if there's no debugging state to be
12219 @@ -414,6 +426,14 @@
12220 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
12223 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
12225 + if (waiter->savestate)
12226 + wake_up_lock_sleeper(waiter->task);
12228 + wake_up_process(waiter->task);
12232 * Max number of times we'll walk the boosting chain:
12234 @@ -421,7 +441,8 @@
12236 static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
12238 - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
12239 + return rt_mutex_real_waiter(p->pi_blocked_on) ?
12240 + p->pi_blocked_on->lock : NULL;
12244 @@ -557,7 +578,7 @@
12245 * reached or the state of the chain has changed while we
12246 * dropped the locks.
12249 + if (!rt_mutex_real_waiter(waiter))
12250 goto out_unlock_pi;
12253 @@ -719,13 +740,16 @@
12254 * follow here. This is the end of the chain we are walking.
12256 if (!rt_mutex_owner(lock)) {
12257 + struct rt_mutex_waiter *lock_top_waiter;
12260 * If the requeue [7] above changed the top waiter,
12261 * then we need to wake the new top waiter up to try
12264 - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
12265 - wake_up_process(rt_mutex_top_waiter(lock)->task);
12266 + lock_top_waiter = rt_mutex_top_waiter(lock);
12267 + if (prerequeue_top_waiter != lock_top_waiter)
12268 + rt_mutex_wake_waiter(lock_top_waiter);
12269 raw_spin_unlock_irq(&lock->wait_lock);
12272 @@ -818,6 +842,25 @@
12277 +#define STEAL_NORMAL 0
12278 +#define STEAL_LATERAL 1
12281 + * Note that RT tasks are excluded from lateral-steals to prevent the
12282 + * introduction of an unbounded latency
12284 +static inline int lock_is_stealable(struct task_struct *task,
12285 + struct task_struct *pendowner, int mode)
12287 + if (mode == STEAL_NORMAL || rt_task(task)) {
12288 + if (task->prio >= pendowner->prio)
12290 + } else if (task->prio > pendowner->prio)
12296 * Try to take an rt-mutex
12298 @@ -828,8 +871,9 @@
12299 * @waiter: The waiter that is queued to the lock's wait tree if the
12300 * callsite called task_blocked_on_lock(), otherwise NULL
12302 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12303 - struct rt_mutex_waiter *waiter)
12304 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
12305 + struct task_struct *task,
12306 + struct rt_mutex_waiter *waiter, int mode)
12309 * Before testing whether we can acquire @lock, we set the
12310 @@ -866,8 +910,10 @@
12311 * If waiter is not the highest priority waiter of
12314 - if (waiter != rt_mutex_top_waiter(lock))
12315 + if (waiter != rt_mutex_top_waiter(lock)) {
12316 + /* XXX lock_is_stealable() ? */
12321 * We can acquire the lock. Remove the waiter from the
12322 @@ -885,14 +931,10 @@
12323 * not need to be dequeued.
12325 if (rt_mutex_has_waiters(lock)) {
12327 - * If @task->prio is greater than or equal to
12328 - * the top waiter priority (kernel view),
12331 - if (task->prio >= rt_mutex_top_waiter(lock)->prio)
12333 + struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
12335 + if (task != pown && !lock_is_stealable(task, pown, mode))
12338 * The current top waiter stays enqueued. We
12339 * don't have to change anything in the lock
12340 @@ -941,6 +983,433 @@
12344 +#ifdef CONFIG_PREEMPT_RT_FULL
12346 + * preemptible spin_lock functions:
12348 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
12349 + void (*slowfn)(struct rt_mutex *lock,
12353 + might_sleep_no_state_check();
12355 + if (do_mig_dis)
12356 + migrate_disable();
12358 + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
12359 + rt_mutex_deadlock_account_lock(lock, current);
12360 + else
12361 + slowfn(lock, do_mig_dis);
12364 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
12365 + int (*slowfn)(struct rt_mutex *lock))
12367 + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
12368 + rt_mutex_deadlock_account_unlock(current);
12369 + return 0;
12370 + }
12371 + return slowfn(lock);
12375 + * Note that owner is a speculative pointer and dereferencing relies
12376 + * on rcu_read_lock() and the check against the lock owner.
12378 +static int adaptive_wait(struct rt_mutex *lock,
12379 + struct task_struct *owner)
12385 + if (owner != rt_mutex_owner(lock))
12388 + * Ensure that owner->on_cpu is dereferenced _after_
12389 + * checking the above to be valid.
12392 + if (!owner->on_cpu) {
12398 + rcu_read_unlock();
12402 +static int adaptive_wait(struct rt_mutex *lock,
12403 + struct task_struct *orig_owner)
12409 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
12410 + struct rt_mutex_waiter *waiter,
12411 + struct task_struct *task,
12412 + enum rtmutex_chainwalk chwalk);
12414 + * Slow path lock function spin_lock style: this variant is very
12415 + * careful not to miss any non-lock wakeups.
12417 + * We store the current state under p->pi_lock in p->saved_state and
12418 + * the try_to_wake_up() code handles this accordingly.
12420 +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
12423 + struct task_struct *lock_owner, *self = current;
12424 + struct rt_mutex_waiter waiter, *top_waiter;
12425 + unsigned long flags;
12428 + rt_mutex_init_waiter(&waiter, true);
12430 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12432 + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
12433 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12437 + BUG_ON(rt_mutex_owner(lock) == self);
12440 + * We save whatever state the task is in and we'll restore it
12441 + * after acquiring the lock taking real wakeups into account
12442 + * as well. We are serialized via pi_lock against wakeups. See
12443 + * try_to_wake_up().
12445 + raw_spin_lock(&self->pi_lock);
12446 + self->saved_state = self->state;
12447 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12448 + raw_spin_unlock(&self->pi_lock);
12450 + ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
12454 + /* Try to acquire the lock again. */
12455 + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
12458 + top_waiter = rt_mutex_top_waiter(lock);
12459 + lock_owner = rt_mutex_owner(lock);
12461 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12463 + debug_rt_mutex_print_deadlock(&waiter);
12465 + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
12467 + migrate_enable();
12470 + migrate_disable();
12473 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12475 + raw_spin_lock(&self->pi_lock);
12476 + __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
12477 + raw_spin_unlock(&self->pi_lock);
12481 + * Restore the task state to current->saved_state. We set it
12482 + * to the original state above and the try_to_wake_up() code
12483 + * has possibly updated it when a real (non-rtmutex) wakeup
12484 + * happened while we were blocked. Clear saved_state so
12485 + * try_to_wake_up() does not get confused.
12487 + raw_spin_lock(&self->pi_lock);
12488 + __set_current_state_no_track(self->saved_state);
12489 + self->saved_state = TASK_RUNNING;
12490 + raw_spin_unlock(&self->pi_lock);
12493 + * try_to_take_rt_mutex() sets the waiter bit
12494 + * unconditionally. We might have to fix that up:
12496 + fixup_rt_mutex_waiters(lock);
12498 + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
12499 + BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
12501 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12503 + debug_rt_mutex_free_waiter(&waiter);
12506 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
12507 + struct wake_q_head *wake_sleeper_q,
12508 + struct rt_mutex *lock);
12510 + * Slow path to release a rt_mutex spin_lock style
12512 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
12514 + unsigned long flags;
12516 + WAKE_Q(wake_sleeper_q);
12518 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12520 + debug_rt_mutex_unlock(lock);
12522 + rt_mutex_deadlock_account_unlock(current);
12524 + if (!rt_mutex_has_waiters(lock)) {
12525 + lock->owner = NULL;
12526 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12530 + mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12532 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12533 + wake_up_q(&wake_q);
12534 + wake_up_q_sleeper(&wake_sleeper_q);
12536 + /* Undo pi boosting when necessary */
12537 + rt_mutex_adjust_prio(current);
12541 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
12543 + unsigned long flags;
12545 + WAKE_Q(wake_sleeper_q);
12547 + raw_spin_lock_irqsave(&lock->wait_lock, flags);
12549 + debug_rt_mutex_unlock(lock);
12551 + rt_mutex_deadlock_account_unlock(current);
12553 + if (!rt_mutex_has_waiters(lock)) {
12554 + lock->owner = NULL;
12555 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12559 + mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
12561 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12562 + wake_up_q(&wake_q);
12563 + wake_up_q_sleeper(&wake_sleeper_q);
12567 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
12569 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
12570 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12572 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
12574 +void __lockfunc rt_spin_lock(spinlock_t *lock)
12576 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12577 + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12579 +EXPORT_SYMBOL(rt_spin_lock);
12581 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
12583 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
12585 +EXPORT_SYMBOL(__rt_spin_lock);
12587 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
12589 + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
12591 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
12593 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12594 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
12596 + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
12597 + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
12599 +EXPORT_SYMBOL(rt_spin_lock_nested);
12602 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
12604 + /* NOTE: we always pass in '1' for nested, for simplicity */
12605 + spin_release(&lock->dep_map, 1, _RET_IP_);
12606 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12608 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
12610 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
12612 + /* NOTE: we always pass in '1' for nested, for simplicity */
12613 + spin_release(&lock->dep_map, 1, _RET_IP_);
12614 + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
12615 + migrate_enable();
12617 +EXPORT_SYMBOL(rt_spin_unlock);
12619 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
12623 + /* NOTE: we always pass in '1' for nested, for simplicity */
12624 + spin_release(&lock->dep_map, 1, _RET_IP_);
12625 + ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
12626 + migrate_enable();
12630 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
12632 + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
12634 +EXPORT_SYMBOL(__rt_spin_unlock);
12637 + * Wait for the lock to get unlocked: instead of polling for an unlock
12638 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
12639 + * schedule if there's contention:
12641 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
12643 + spin_lock(lock);
12644 + spin_unlock(lock);
12646 +EXPORT_SYMBOL(rt_spin_unlock_wait);
12648 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
12652 + ret = rt_mutex_trylock(&lock->lock);
12654 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12657 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
12659 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
12663 + migrate_disable();
12664 + ret = rt_mutex_trylock(&lock->lock);
12666 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12668 + migrate_enable();
12671 +EXPORT_SYMBOL(rt_spin_trylock);
12673 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
12677 + local_bh_disable();
12678 + ret = rt_mutex_trylock(&lock->lock);
12680 + migrate_disable();
12681 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12683 + local_bh_enable();
12686 +EXPORT_SYMBOL(rt_spin_trylock_bh);
12688 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
12693 + ret = rt_mutex_trylock(&lock->lock);
12695 + migrate_disable();
12696 + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12700 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
12702 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
12704 + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
12705 + if (atomic_add_unless(atomic, -1, 1))
12706 + return 0;
12707 + rt_spin_lock(lock);
12708 + if (atomic_dec_and_test(atomic))
12709 + return 1;
12710 + rt_spin_unlock(lock);
12711 + return 0;
12712 +}
12713 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
12716 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
12718 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12720 + * Make sure we are not reinitializing a held lock:
12722 + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
12723 + lockdep_init_map(&lock->dep_map, name, key, 0);
12726 +EXPORT_SYMBOL(__rt_spin_lock_init);
12728 +#endif /* PREEMPT_RT_FULL */
12730 +#ifdef CONFIG_PREEMPT_RT_FULL
12731 + static inline int __sched
12732 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
12734 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
12735 + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
12740 + if (unlikely(ctx == hold_ctx))
12741 + return -EALREADY;
12743 + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
12744 + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
12745 +#ifdef CONFIG_DEBUG_MUTEXES
12746 + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
12747 + ctx->contending_lock = ww;
12755 + static inline int __sched
12756 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
12765 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
12766 + struct rt_mutex_waiter *waiter)
12768 + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
12772 * Task blocks on lock.
12774 @@ -971,6 +1440,23 @@
12777 raw_spin_lock(&task->pi_lock);
12780 + * In the case of futex requeue PI, this will be a proxy
12781 + * lock. The task will wake unaware that it is enqueued on
12782 + * this lock. Avoid blocking on two locks and corrupting
12783 + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
12784 + * flag. futex_wait_requeue_pi() sets this when it wakes up
12785 + * before requeue (due to a signal or timeout). Do not enqueue
12786 + * the task if PI_WAKEUP_INPROGRESS is set.
12788 + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
12789 + raw_spin_unlock(&task->pi_lock);
12793 + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
12795 __rt_mutex_adjust_prio(task);
12796 waiter->task = task;
12797 waiter->lock = lock;
12798 @@ -994,7 +1480,7 @@
12799 rt_mutex_enqueue_pi(owner, waiter);
12801 __rt_mutex_adjust_prio(owner);
12802 - if (owner->pi_blocked_on)
12803 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
12805 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
12807 @@ -1036,6 +1522,7 @@
12808 * Called with lock->wait_lock held and interrupts disabled.
12810 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
12811 + struct wake_q_head *wake_sleeper_q,
12812 struct rt_mutex *lock)
12814 struct rt_mutex_waiter *waiter;
12815 @@ -1064,7 +1551,10 @@
12817 raw_spin_unlock(&current->pi_lock);
12819 - wake_q_add(wake_q, waiter->task);
12820 + if (waiter->savestate)
12821 + wake_q_add(wake_sleeper_q, waiter->task);
12823 + wake_q_add(wake_q, waiter->task);
12827 @@ -1078,7 +1568,7 @@
12829 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
12830 struct task_struct *owner = rt_mutex_owner(lock);
12831 - struct rt_mutex *next_lock;
12832 + struct rt_mutex *next_lock = NULL;
12834 raw_spin_lock(&current->pi_lock);
12835 rt_mutex_dequeue(lock, waiter);
12836 @@ -1102,7 +1592,8 @@
12837 __rt_mutex_adjust_prio(owner);
12839 /* Store the lock on which owner is blocked or NULL */
12840 - next_lock = task_blocked_on_lock(owner);
12841 + if (rt_mutex_real_waiter(owner->pi_blocked_on))
12842 + next_lock = task_blocked_on_lock(owner);
12844 raw_spin_unlock(&owner->pi_lock);
12846 @@ -1138,17 +1629,17 @@
12847 raw_spin_lock_irqsave(&task->pi_lock, flags);
12849 waiter = task->pi_blocked_on;
12850 - if (!waiter || (waiter->prio == task->prio &&
12851 + if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
12852 !dl_prio(task->prio))) {
12853 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12856 next_lock = waiter->lock;
12857 - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12859 /* gets dropped in rt_mutex_adjust_prio_chain()! */
12860 get_task_struct(task);
12862 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
12863 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
12864 next_lock, NULL, task);
12866 @@ -1166,7 +1657,8 @@
12868 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
12869 struct hrtimer_sleeper *timeout,
12870 - struct rt_mutex_waiter *waiter)
12871 + struct rt_mutex_waiter *waiter,
12872 + struct ww_acquire_ctx *ww_ctx)
12876 @@ -1189,6 +1681,12 @@
12880 + if (ww_ctx && ww_ctx->acquired > 0) {
12881 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
12886 raw_spin_unlock_irq(&lock->wait_lock);
12888 debug_rt_mutex_print_deadlock(waiter);
12889 @@ -1223,21 +1721,96 @@
12893 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
12894 + struct ww_acquire_ctx *ww_ctx)
12896 +#ifdef CONFIG_DEBUG_MUTEXES
12898 + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
12899 + * but released with a normal mutex_unlock in this call.
12901 + * This should never happen, always use ww_mutex_unlock.
12903 + DEBUG_LOCKS_WARN_ON(ww->ctx);
12906 + * Not quite done after calling ww_acquire_done() ?
12908 + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
12910 + if (ww_ctx->contending_lock) {
12912 + * After -EDEADLK you tried to
12913 + * acquire a different ww_mutex? Bad!
12915 + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
12918 + * You called ww_mutex_lock after receiving -EDEADLK,
12919 + * but 'forgot' to unlock everything else first?
12921 + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
12922 + ww_ctx->contending_lock = NULL;
12926 + * Naughty, using a different class will lead to undefined behavior!
12928 + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
12930 + ww_ctx->acquired++;
12933 +#ifdef CONFIG_PREEMPT_RT_FULL
12934 +static void ww_mutex_account_lock(struct rt_mutex *lock,
12935 + struct ww_acquire_ctx *ww_ctx)
12937 + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
12938 + struct rt_mutex_waiter *waiter, *n;
12941 + * This branch gets optimized out for the common case,
12942 + * and is only important for ww_mutex_lock.
12944 + ww_mutex_lock_acquired(ww, ww_ctx);
12945 + ww->ctx = ww_ctx;
12948 + * Give any possible sleeping processes the chance to wake up,
12949 + * so they can recheck if they have to back off.
12951 + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
12953 + /* XXX debug rt mutex waiter wakeup */
12955 + BUG_ON(waiter->lock != lock);
12956 + rt_mutex_wake_waiter(waiter);
12962 +static void ww_mutex_account_lock(struct rt_mutex *lock,
12963 + struct ww_acquire_ctx *ww_ctx)
12970 * Slow path lock function:
12973 rt_mutex_slowlock(struct rt_mutex *lock, int state,
12974 struct hrtimer_sleeper *timeout,
12975 - enum rtmutex_chainwalk chwalk)
12976 + enum rtmutex_chainwalk chwalk,
12977 + struct ww_acquire_ctx *ww_ctx)
12979 struct rt_mutex_waiter waiter;
12980 unsigned long flags;
12983 - debug_rt_mutex_init_waiter(&waiter);
12984 - RB_CLEAR_NODE(&waiter.pi_tree_entry);
12985 - RB_CLEAR_NODE(&waiter.tree_entry);
12986 + rt_mutex_init_waiter(&waiter, false);
12989 * Technically we could use raw_spin_[un]lock_irq() here, but this can
12990 @@ -1251,6 +1824,8 @@
12992 /* Try to acquire the lock again: */
12993 if (try_to_take_rt_mutex(lock, current, NULL)) {
12995 + ww_mutex_account_lock(lock, ww_ctx);
12996 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
12999 @@ -1265,13 +1840,23 @@
13002 /* sleep on the mutex */
13003 - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
13004 + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
13006 + else if (ww_ctx) {
13007 + /* ww_mutex received EDEADLK, let it become EALREADY */
13008 + ret = __mutex_lock_check_stamp(lock, ww_ctx);
13012 if (unlikely(ret)) {
13013 __set_current_state(TASK_RUNNING);
13014 if (rt_mutex_has_waiters(lock))
13015 remove_waiter(lock, &waiter);
13016 - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13017 + /* ww_mutex wants to report -EDEADLK/-EALREADY, let it */
13019 + rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13020 + } else if (ww_ctx) {
13021 + ww_mutex_account_lock(lock, ww_ctx);
13025 @@ -1331,7 +1916,8 @@
13026 * Return whether the current task needs to undo a potential priority boosting.
13028 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13029 - struct wake_q_head *wake_q)
13030 + struct wake_q_head *wake_q,
13031 + struct wake_q_head *wake_sleeper_q)
13033 unsigned long flags;
13035 @@ -1387,7 +1973,7 @@
13037 * Queue the next waiter for wakeup once we release the wait_lock.
13039 - mark_wakeup_next_waiter(wake_q, lock);
13040 + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
13042 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13044 @@ -1403,31 +1989,36 @@
13047 rt_mutex_fastlock(struct rt_mutex *lock, int state,
13048 + struct ww_acquire_ctx *ww_ctx,
13049 int (*slowfn)(struct rt_mutex *lock, int state,
13050 struct hrtimer_sleeper *timeout,
13051 - enum rtmutex_chainwalk chwalk))
13052 + enum rtmutex_chainwalk chwalk,
13053 + struct ww_acquire_ctx *ww_ctx))
13055 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13056 rt_mutex_deadlock_account_lock(lock, current);
13059 - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
13060 + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
13065 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
13066 struct hrtimer_sleeper *timeout,
13067 enum rtmutex_chainwalk chwalk,
13068 + struct ww_acquire_ctx *ww_ctx,
13069 int (*slowfn)(struct rt_mutex *lock, int state,
13070 struct hrtimer_sleeper *timeout,
13071 - enum rtmutex_chainwalk chwalk))
13072 + enum rtmutex_chainwalk chwalk,
13073 + struct ww_acquire_ctx *ww_ctx))
13075 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
13076 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13077 rt_mutex_deadlock_account_lock(lock, current);
13080 - return slowfn(lock, state, timeout, chwalk);
13081 + return slowfn(lock, state, timeout, chwalk, ww_ctx);
13085 @@ -1444,17 +2035,20 @@
13087 rt_mutex_fastunlock(struct rt_mutex *lock,
13088 bool (*slowfn)(struct rt_mutex *lock,
13089 - struct wake_q_head *wqh))
13090 + struct wake_q_head *wqh,
13091 + struct wake_q_head *wq_sleeper))
13094 + WAKE_Q(wake_sleeper_q);
13096 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13097 rt_mutex_deadlock_account_unlock(current);
13100 - bool deboost = slowfn(lock, &wake_q);
13101 + bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
13103 wake_up_q(&wake_q);
13104 + wake_up_q_sleeper(&wake_sleeper_q);
13106 /* Undo pi boosting if necessary: */
13108 @@ -1471,7 +2065,7 @@
13112 - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
13113 + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
13115 EXPORT_SYMBOL_GPL(rt_mutex_lock);
13117 @@ -1488,7 +2082,7 @@
13121 - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
13122 + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
13124 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
13126 @@ -1501,11 +2095,30 @@
13129 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13130 - RT_MUTEX_FULL_CHAINWALK,
13131 + RT_MUTEX_FULL_CHAINWALK, NULL,
13132 rt_mutex_slowlock);
13136 + * rt_mutex_lock_killable - lock a rt_mutex killable
13138 + * @lock: the rt_mutex to be locked
13140 + * Returns:
13141 + * 0 on success
13143 + * -EINTR when interrupted by a fatal signal
13146 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
13150 + return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
13152 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
13155 * rt_mutex_timed_lock - lock a rt_mutex interruptible
13156 * the timeout structure is provided
13158 @@ -1525,6 +2138,7 @@
13160 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13161 RT_MUTEX_MIN_CHAINWALK,
13163 rt_mutex_slowlock);
13165 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13166 @@ -1542,7 +2156,11 @@
13168 int __sched rt_mutex_trylock(struct rt_mutex *lock)
13170 +#ifdef CONFIG_PREEMPT_RT_FULL
13171 + if (WARN_ON_ONCE(in_irq() || in_nmi()))
13172 +#else
13173 if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
13174 +#endif
13175 return 0;
13177 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
13178 @@ -1568,13 +2186,14 @@
13181 bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
13182 - struct wake_q_head *wqh)
13183 + struct wake_q_head *wqh,
13184 + struct wake_q_head *wq_sleeper)
13186 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13187 rt_mutex_deadlock_account_unlock(current);
13190 - return rt_mutex_slowunlock(lock, wqh);
13191 + return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
13195 @@ -1607,13 +2226,12 @@
13196 void __rt_mutex_init(struct rt_mutex *lock, const char *name)
13198 lock->owner = NULL;
13199 - raw_spin_lock_init(&lock->wait_lock);
13200 lock->waiters = RB_ROOT;
13201 lock->waiters_leftmost = NULL;
13203 debug_rt_mutex_init(lock, name);
13205 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
13206 +EXPORT_SYMBOL(__rt_mutex_init);
13209 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
13210 @@ -1628,7 +2246,7 @@
13211 void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13212 struct task_struct *proxy_owner)
13214 - __rt_mutex_init(lock, NULL);
13215 + rt_mutex_init(lock);
13216 debug_rt_mutex_proxy_lock(lock, proxy_owner);
13217 rt_mutex_set_owner(lock, proxy_owner);
13218 rt_mutex_deadlock_account_lock(lock, proxy_owner);
13219 @@ -1676,6 +2294,35 @@
13223 +#ifdef CONFIG_PREEMPT_RT_FULL
13225 + * In PREEMPT_RT there's an added race.
13226 + * If the task, that we are about to requeue, times out,
13227 + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
13228 + * to skip this task. But right after the task sets
13229 + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
13230 + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
13231 + * This will replace the PI_WAKEUP_INPROGRESS with the actual
13232 + * lock that it blocks on. We *must not* place this task
13233 + * on this proxy lock in that case.
13235 + * To prevent this race, we first take the task's pi_lock
13236 + * and check if it has updated its pi_blocked_on. If it has,
13237 + * we assume that it woke up and we return -EAGAIN.
13238 + * Otherwise, we set the task's pi_blocked_on to
13239 + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
13240 + * it will know that we are in the process of requeuing it.
13242 + raw_spin_lock(&task->pi_lock);
13243 + if (task->pi_blocked_on) {
13244 + raw_spin_unlock(&task->pi_lock);
13245 + raw_spin_unlock_irq(&lock->wait_lock);
13248 + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
13249 + raw_spin_unlock(&task->pi_lock);
13252 /* We enforce deadlock detection for futexes */
13253 ret = task_blocks_on_rt_mutex(lock, waiter, task,
13254 RT_MUTEX_FULL_CHAINWALK);
13255 @@ -1690,7 +2337,7 @@
13259 - if (unlikely(ret))
13260 + if (ret && rt_mutex_has_waiters(lock))
13261 remove_waiter(lock, waiter);
13263 raw_spin_unlock_irq(&lock->wait_lock);
13264 @@ -1746,7 +2393,7 @@
13265 set_current_state(TASK_INTERRUPTIBLE);
13267 /* sleep on the mutex */
13268 - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
13269 + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
13272 remove_waiter(lock, waiter);
13273 @@ -1761,3 +2408,89 @@
13279 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
13281 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
13284 + if (ctx->deadlock_inject_countdown-- == 0) {
13285 + tmp = ctx->deadlock_inject_interval;
13286 + if (tmp > UINT_MAX/4)
13289 + tmp = tmp*2 + tmp + tmp/2;
13291 + ctx->deadlock_inject_interval = tmp;
13292 + ctx->deadlock_inject_countdown = tmp;
13293 + ctx->contending_lock = lock;
13295 + ww_mutex_unlock(lock);
13304 +#ifdef CONFIG_PREEMPT_RT_FULL
13306 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13312 + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13313 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
13315 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13316 + else if (!ret && ww_ctx->acquired > 1)
13317 + return ww_mutex_deadlock_injection(lock, ww_ctx);
13321 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
13324 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
13330 + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
13331 + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
13333 + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
13334 + else if (!ret && ww_ctx->acquired > 1)
13335 + return ww_mutex_deadlock_injection(lock, ww_ctx);
13339 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
13341 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
13343 + int nest = !!lock->ctx;
13346 + * The unlocking fastpath is the 0->1 transition from 'locked'
13347 + * into 'unlocked' state:
13350 +#ifdef CONFIG_DEBUG_MUTEXES
13351 + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
13353 + if (lock->ctx->acquired > 0)
13354 + lock->ctx->acquired--;
13355 + lock->ctx = NULL;
13358 + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
13359 + rt_mutex_unlock(&lock->base.lock);
13361 +EXPORT_SYMBOL(ww_mutex_unlock);
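The RT implementation above preserves the wound/wait contract: -EALREADY when a context relocks a mutex it already holds, -EDEADLK when a younger context must back off. The standard caller pattern, sketched for two locks (demo_ww_class and lock_pair are illustrative names, not part of this patch):

#include <linux/kernel.h>
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
{
        struct ww_acquire_ctx ctx;

        ww_acquire_init(&ctx, &demo_ww_class);
        ww_mutex_lock(a, &ctx); /* first lock in a fresh ctx cannot deadlock */
        while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
                /* wounded by an older context: back off, then sleep on b */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, &ctx);
                swap(a, b);     /* we now hold what used to be b */
        }
        ww_acquire_done(&ctx);
        /* ... critical section; unlock both, then ww_acquire_fini(&ctx) */
}

On PREEMPT_RT_FULL the same contract is honoured by rt_mutex_slowlock() via __mutex_lock_check_stamp(), so callers written against the mainline ww_mutex API need no changes.
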
13363 diff -Nur linux-4.9.6.orig/kernel/locking/rtmutex_common.h linux-4.9.6/kernel/locking/rtmutex_common.h
13364 --- linux-4.9.6.orig/kernel/locking/rtmutex_common.h 2017-01-26 08:25:24.000000000 +0100
13365 +++ linux-4.9.6/kernel/locking/rtmutex_common.h 2017-01-28 13:59:10.035660185 +0100
13367 struct rb_node pi_tree_entry;
13368 struct task_struct *task;
13369 struct rt_mutex *lock;
13371 #ifdef CONFIG_DEBUG_RT_MUTEXES
13373 struct pid *deadlock_task_pid;
13376 * PI-futex support (proxy locking functions, etc.):
13378 +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
13379 +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
13381 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
13382 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13383 struct task_struct *proxy_owner);
13384 @@ -111,7 +115,8 @@
13385 struct rt_mutex_waiter *waiter);
13386 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
13387 extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
13388 - struct wake_q_head *wqh);
13389 + struct wake_q_head *wqh,
13390 + struct wake_q_head *wq_sleeper);
13391 extern void rt_mutex_adjust_prio(struct task_struct *task);
13393 #ifdef CONFIG_DEBUG_RT_MUTEXES
13394 @@ -120,4 +125,14 @@
13395 # include "rtmutex.h"
13398 +static inline void
13399 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
13401 + debug_rt_mutex_init_waiter(waiter);
13402 + waiter->task = NULL;
13403 + waiter->savestate = savestate;
13404 + RB_CLEAR_NODE(&waiter->pi_tree_entry);
13405 + RB_CLEAR_NODE(&waiter->tree_entry);
13409 diff -Nur linux-4.9.6.orig/kernel/locking/spinlock.c linux-4.9.6/kernel/locking/spinlock.c
13410 --- linux-4.9.6.orig/kernel/locking/spinlock.c 2017-01-26 08:25:24.000000000 +0100
13411 +++ linux-4.9.6/kernel/locking/spinlock.c 2017-01-28 13:59:10.035660185 +0100
13412 @@ -124,8 +124,11 @@
13413 * __[spin|read|write]_lock_bh()
13415 BUILD_LOCK_OPS(spin, raw_spinlock);
13417 +#ifndef CONFIG_PREEMPT_RT_FULL
13418 BUILD_LOCK_OPS(read, rwlock);
13419 BUILD_LOCK_OPS(write, rwlock);
13424 @@ -209,6 +212,8 @@
13425 EXPORT_SYMBOL(_raw_spin_unlock_bh);
13428 +#ifndef CONFIG_PREEMPT_RT_FULL
13430 #ifndef CONFIG_INLINE_READ_TRYLOCK
13431 int __lockfunc _raw_read_trylock(rwlock_t *lock)
13433 @@ -353,6 +358,8 @@
13434 EXPORT_SYMBOL(_raw_write_unlock_bh);
13437 +#endif /* !PREEMPT_RT_FULL */
13439 #ifdef CONFIG_DEBUG_LOCK_ALLOC
13441 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
13442 diff -Nur linux-4.9.6.orig/kernel/locking/spinlock_debug.c linux-4.9.6/kernel/locking/spinlock_debug.c
13443 --- linux-4.9.6.orig/kernel/locking/spinlock_debug.c 2017-01-26 08:25:24.000000000 +0100
13444 +++ linux-4.9.6/kernel/locking/spinlock_debug.c 2017-01-28 13:59:10.035660185 +0100
13447 EXPORT_SYMBOL(__raw_spin_lock_init);
13449 +#ifndef CONFIG_PREEMPT_RT_FULL
13450 void __rwlock_init(rwlock_t *lock, const char *name,
13451 struct lock_class_key *key)
13456 EXPORT_SYMBOL(__rwlock_init);
13459 static void spin_dump(raw_spinlock_t *lock, const char *msg)
13461 @@ -159,6 +161,7 @@
13462 arch_spin_unlock(&lock->raw_lock);
13465 +#ifndef CONFIG_PREEMPT_RT_FULL
13466 static void rwlock_bug(rwlock_t *lock, const char *msg)
13468 if (!debug_locks_off())
13469 @@ -300,3 +303,5 @@
13470 debug_write_unlock(lock);
13471 arch_write_unlock(&lock->raw_lock);
13475 diff -Nur linux-4.9.6.orig/kernel/panic.c linux-4.9.6/kernel/panic.c
13476 --- linux-4.9.6.orig/kernel/panic.c 2017-01-26 08:25:24.000000000 +0100
13477 +++ linux-4.9.6/kernel/panic.c 2017-01-28 13:59:10.035660185 +0100
13478 @@ -482,9 +482,11 @@
13480 static int init_oops_id(void)
13482 +#ifndef CONFIG_PREEMPT_RT_FULL
13484 get_random_bytes(&oops_id, sizeof(oops_id));
13490 diff -Nur linux-4.9.6.orig/kernel/power/hibernate.c linux-4.9.6/kernel/power/hibernate.c
13491 --- linux-4.9.6.orig/kernel/power/hibernate.c 2017-01-26 08:25:24.000000000 +0100
13492 +++ linux-4.9.6/kernel/power/hibernate.c 2017-01-28 13:59:10.039660339 +0100
13493 @@ -286,6 +286,8 @@
13495 local_irq_disable();
13497 + system_state = SYSTEM_SUSPEND;
13499 error = syscore_suspend();
13501 printk(KERN_ERR "PM: Some system devices failed to power down, "
13502 @@ -317,6 +319,7 @@
13506 + system_state = SYSTEM_RUNNING;
13507 local_irq_enable();
13510 @@ -446,6 +449,7 @@
13513 local_irq_disable();
13514 + system_state = SYSTEM_SUSPEND;
13516 error = syscore_suspend();
13518 @@ -479,6 +483,7 @@
13522 + system_state = SYSTEM_RUNNING;
13523 local_irq_enable();
13526 @@ -564,6 +569,7 @@
13529 local_irq_disable();
13530 + system_state = SYSTEM_SUSPEND;
13532 if (pm_wakeup_pending()) {
13534 @@ -576,6 +582,7 @@
13538 + system_state = SYSTEM_RUNNING;
13539 local_irq_enable();
13542 @@ -676,6 +683,10 @@
13546 +#ifndef CONFIG_SUSPEND
13547 +bool pm_in_action;
13551 * hibernate - Carry out system hibernation, including saving the image.
13553 @@ -689,6 +700,8 @@
13557 + pm_in_action = true;
13559 lock_system_sleep();
13560 /* The snapshot device should not be opened while we're running */
13561 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
13562 @@ -766,6 +779,7 @@
13563 atomic_inc(&snapshot_device_available);
13565 unlock_system_sleep();
13566 + pm_in_action = false;
13570 diff -Nur linux-4.9.6.orig/kernel/power/suspend.c linux-4.9.6/kernel/power/suspend.c
13571 --- linux-4.9.6.orig/kernel/power/suspend.c 2017-01-26 08:25:24.000000000 +0100
13572 +++ linux-4.9.6/kernel/power/suspend.c 2017-01-28 13:59:10.039660339 +0100
13573 @@ -369,6 +369,8 @@
13574 arch_suspend_disable_irqs();
13575 BUG_ON(!irqs_disabled());
13577 + system_state = SYSTEM_SUSPEND;
13579 error = syscore_suspend();
13581 *wakeup = pm_wakeup_pending();
13582 @@ -385,6 +387,8 @@
13586 + system_state = SYSTEM_RUNNING;
13588 arch_suspend_enable_irqs();
13589 BUG_ON(irqs_disabled());
13591 @@ -527,6 +531,8 @@
13595 +bool pm_in_action;
13598 * pm_suspend - Externally visible function for suspending the system.
13599 * @state: System sleep state to enter.
13600 @@ -541,6 +547,8 @@
13601 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
13604 + pm_in_action = true;
13606 error = enter_state(state);
13608 suspend_stats.fail++;
13609 @@ -548,6 +556,7 @@
13611 suspend_stats.success++;
13613 + pm_in_action = false;
13616 EXPORT_SYMBOL(pm_suspend);
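pm_in_action marks that a suspend or hibernation cycle is in flight. It is defined here (and in hibernate.c when CONFIG_SUSPEND is off) so other code can relax debug checks that would misfire during resume; the swait changes later in this patch use it exactly that way. A minimal sketch of the consumer side (the helper name is illustrative):

	extern bool pm_in_action;

	static void warn_on_mass_wakeup(unsigned int wakes)
	{
		/* resume legitimately completes many waiters at once */
		if (pm_in_action)
			return;
		WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
	}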
13617 diff -Nur linux-4.9.6.orig/kernel/printk/printk.c linux-4.9.6/kernel/printk/printk.c
13618 --- linux-4.9.6.orig/kernel/printk/printk.c 2017-01-26 08:25:24.000000000 +0100
13619 +++ linux-4.9.6/kernel/printk/printk.c 2017-01-28 13:59:10.039660339 +0100
13620 @@ -351,6 +351,65 @@
13622 DEFINE_RAW_SPINLOCK(logbuf_lock);
13624 +#ifdef CONFIG_EARLY_PRINTK
13625 +struct console *early_console;
13627 +static void early_vprintk(const char *fmt, va_list ap)
13629 + if (early_console) {
13631 + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
13633 + early_console->write(early_console, buf, n);
13637 +asmlinkage void early_printk(const char *fmt, ...)
13641 + va_start(ap, fmt);
13642 + early_vprintk(fmt, ap);
13647 + * This is independent of any log levels - a global
13648 + * kill switch that turns off all of printk.
13650 + * Used by the NMI watchdog if early-printk is enabled.
13652 +static bool __read_mostly printk_killswitch;
13654 +static int __init force_early_printk_setup(char *str)
13656 + printk_killswitch = true;
13659 +early_param("force_early_printk", force_early_printk_setup);
13661 +void printk_kill(void)
13663 + printk_killswitch = true;
13666 +#ifdef CONFIG_PRINTK
13667 +static int forced_early_printk(const char *fmt, va_list ap)
13669 + if (!printk_killswitch)
13671 + early_vprintk(fmt, ap);
13677 +static inline int forced_early_printk(const char *fmt, va_list ap)
13683 #ifdef CONFIG_PRINTK
13684 DECLARE_WAIT_QUEUE_HEAD(log_wait);
13685 /* the next printk record to read by syslog(READ) or /proc/kmsg */
13686 @@ -1337,6 +1396,7 @@
13690 + int attempts = 0;
13692 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
13694 @@ -1348,6 +1408,14 @@
13697 enum log_flags prev;
13701 + if (attempts > 10) {
13708 * Find first record that fits, including all following records,
13709 @@ -1363,6 +1431,14 @@
13711 idx = log_next(idx);
13714 + if (num_msg > 5) {
13716 + raw_spin_unlock_irq(&logbuf_lock);
13717 + raw_spin_lock_irq(&logbuf_lock);
13718 + if (clear_seq < log_first_seq)
13723 /* move first record forward until length fits into the buffer */
13724 @@ -1376,6 +1452,14 @@
13726 idx = log_next(idx);
13729 + if (num_msg > 5) {
13731 + raw_spin_unlock_irq(&logbuf_lock);
13732 + raw_spin_lock_irq(&logbuf_lock);
13733 + if (clear_seq < log_first_seq)
13738 /* last message fitting into this dump */
13739 @@ -1416,6 +1500,7 @@
13740 clear_seq = log_next_seq;
13741 clear_idx = log_next_idx;
13744 raw_spin_unlock_irq(&logbuf_lock);
13747 @@ -1569,6 +1654,12 @@
13748 if (!console_drivers)
13751 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
13752 + if (in_irq() || in_nmi())
13756 + migrate_disable();
13757 for_each_console(con) {
13758 if (exclusive_console && con != exclusive_console)
13760 @@ -1584,6 +1675,7 @@
13762 con->write(con, text, len);
13764 + migrate_enable();
13768 @@ -1781,6 +1873,13 @@
13769 /* cpu currently holding logbuf_lock in this function */
13770 static unsigned int logbuf_cpu = UINT_MAX;
13773 + * Fall back to early_printk if a debugging subsystem has
13774 + * killed printk output
13776 + if (unlikely(forced_early_printk(fmt, args)))
13779 if (level == LOGLEVEL_SCHED) {
13780 level = LOGLEVEL_DEFAULT;
13782 @@ -1885,13 +1984,23 @@
13784 /* If called from the scheduler, we can not call up(). */
13786 + int may_trylock = 1;
13789 +#ifdef CONFIG_PREEMPT_RT_FULL
13791 + * we can't take a sleeping lock with IRQs or preemption disabled
13792 + * so we can't print in these contexts
13794 + if (!(preempt_count() == 0 && !irqs_disabled()))
13798 * Try to acquire and then immediately release the console
13799 * semaphore. The release will print out buffers and wake up
13800 * /dev/kmsg and syslog() users.
13802 - if (console_trylock())
13803 + if (may_trylock && console_trylock())
13807 @@ -2014,26 +2123,6 @@
13809 #endif /* CONFIG_PRINTK */
13811 -#ifdef CONFIG_EARLY_PRINTK
13812 -struct console *early_console;
13814 -asmlinkage __visible void early_printk(const char *fmt, ...)
13820 - if (!early_console)
13823 - va_start(ap, fmt);
13824 - n = vscnprintf(buf, sizeof(buf), fmt, ap);
13827 - early_console->write(early_console, buf, n);
13831 static int __add_preferred_console(char *name, int idx, char *options,
13834 @@ -2303,11 +2392,16 @@
13837 len = cont_print_text(text, size);
13838 +#ifdef CONFIG_PREEMPT_RT_FULL
13839 + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13840 + call_console_drivers(cont.level, NULL, 0, text, len);
13842 raw_spin_unlock(&logbuf_lock);
13843 stop_critical_timings();
13844 call_console_drivers(cont.level, NULL, 0, text, len);
13845 start_critical_timings();
13846 local_irq_restore(flags);
13850 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13851 @@ -2431,13 +2525,17 @@
13852 console_idx = log_next(console_idx);
13854 console_prev = msg->flags;
13855 +#ifdef CONFIG_PREEMPT_RT_FULL
13856 + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
13857 + call_console_drivers(level, ext_text, ext_len, text, len);
13859 raw_spin_unlock(&logbuf_lock);
13861 stop_critical_timings(); /* don't trace print latency */
13862 call_console_drivers(level, ext_text, ext_len, text, len);
13863 start_critical_timings();
13864 local_irq_restore(flags);
13867 if (do_cond_resched)
13870 @@ -2489,6 +2587,11 @@
13874 + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
13875 + if (in_irq() || in_nmi())
13880 * console_unblank can no longer be called in interrupt context unless
13881 * oops_in_progress is set to 1..
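The kill switch gives debugging code a way to bypass printk's locks entirely: once printk_kill() is called, vprintk_emit() diverts every message to the early console via forced_early_printk(), and the same behaviour can be forced from boot with force_early_printk on the command line. A sketch of the intended caller (the watchdog function name is an illustrative assumption):

	/* e.g. in an NMI watchdog about to report a hard lockup */
	static void hard_lockup_report_sketch(void)
	{
		printk_kill();	/* later printk()s bypass logbuf_lock */
		early_printk("NMI watchdog: hard LOCKUP on CPU %d\n",
			     raw_smp_processor_id());
	}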
13882 diff -Nur linux-4.9.6.orig/kernel/ptrace.c linux-4.9.6/kernel/ptrace.c
13883 --- linux-4.9.6.orig/kernel/ptrace.c 2017-01-26 08:25:24.000000000 +0100
13884 +++ linux-4.9.6/kernel/ptrace.c 2017-01-28 13:59:10.039660339 +0100
13885 @@ -166,7 +166,14 @@
13887 spin_lock_irq(&task->sighand->siglock);
13888 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
13889 - task->state = __TASK_TRACED;
13890 + unsigned long flags;
13892 + raw_spin_lock_irqsave(&task->pi_lock, flags);
13893 + if (task->state & __TASK_TRACED)
13894 + task->state = __TASK_TRACED;
13896 + task->saved_state = __TASK_TRACED;
13897 + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13900 spin_unlock_irq(&task->sighand->siglock);
13901 diff -Nur linux-4.9.6.orig/kernel/rcu/rcutorture.c linux-4.9.6/kernel/rcu/rcutorture.c
13902 --- linux-4.9.6.orig/kernel/rcu/rcutorture.c 2017-01-26 08:25:24.000000000 +0100
13903 +++ linux-4.9.6/kernel/rcu/rcutorture.c 2017-01-28 13:59:10.039660339 +0100
13904 @@ -404,6 +404,7 @@
13908 +#ifndef CONFIG_PREEMPT_RT_FULL
13910 * Definitions for rcu_bh torture testing.
13912 @@ -443,6 +444,12 @@
13917 +static struct rcu_torture_ops rcu_bh_ops = {
13918 + .ttype = INVALID_RCU_FLAVOR,
13923 * Don't even think about trying any of these in real life!!!
13924 * The names includes "busted", and they really means it!
13925 diff -Nur linux-4.9.6.orig/kernel/rcu/tree.c linux-4.9.6/kernel/rcu/tree.c
13926 --- linux-4.9.6.orig/kernel/rcu/tree.c 2017-01-26 08:25:24.000000000 +0100
13927 +++ linux-4.9.6/kernel/rcu/tree.c 2017-01-28 13:59:10.039660339 +0100
13929 #include <linux/random.h>
13930 #include <linux/trace_events.h>
13931 #include <linux/suspend.h>
13932 +#include <linux/delay.h>
13933 +#include <linux/gfp.h>
13934 +#include <linux/oom.h>
13935 +#include <linux/smpboot.h>
13936 +#include "../time/tick-internal.h"
13940 @@ -260,6 +265,19 @@
13941 this_cpu_ptr(&rcu_sched_data), true);
13944 +#ifdef CONFIG_PREEMPT_RT_FULL
13945 +static void rcu_preempt_qs(void);
13947 +void rcu_bh_qs(void)
13949 + unsigned long flags;
13951 + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
13952 + local_irq_save(flags);
13953 + rcu_preempt_qs();
13954 + local_irq_restore(flags);
13957 void rcu_bh_qs(void)
13959 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
13960 @@ -269,6 +287,7 @@
13961 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
13966 static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
13968 @@ -449,11 +468,13 @@
13970 * Return the number of RCU BH batches started thus far for debug & stats.
13972 +#ifndef CONFIG_PREEMPT_RT_FULL
13973 unsigned long rcu_batches_started_bh(void)
13975 return rcu_bh_state.gpnum;
13977 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
13981 * Return the number of RCU batches completed thus far for debug & stats.
13982 @@ -473,6 +494,7 @@
13984 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
13986 +#ifndef CONFIG_PREEMPT_RT_FULL
13988 * Return the number of RCU BH batches completed thus far for debug & stats.
13990 @@ -481,6 +503,7 @@
13991 return rcu_bh_state.completed;
13993 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
13997 * Return the number of RCU expedited batches completed thus far for
13998 @@ -504,6 +527,7 @@
14000 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
14002 +#ifndef CONFIG_PREEMPT_RT_FULL
14004 * Force a quiescent state.
14006 @@ -522,6 +546,13 @@
14008 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
14011 +void rcu_force_quiescent_state(void)
14014 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
14018 * Force a quiescent state for RCU-sched.
14020 @@ -572,9 +603,11 @@
14024 +#ifndef CONFIG_PREEMPT_RT_FULL
14025 case RCU_BH_FLAVOR:
14026 rsp = &rcu_bh_state;
14029 case RCU_SCHED_FLAVOR:
14030 rsp = &rcu_sched_state;
14032 @@ -3016,18 +3049,17 @@
14034 * Do RCU core processing for the current CPU.
14036 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
14037 +static __latent_entropy void rcu_process_callbacks(void)
14039 struct rcu_state *rsp;
14041 if (cpu_is_offline(smp_processor_id()))
14043 - trace_rcu_utilization(TPS("Start RCU core"));
14044 for_each_rcu_flavor(rsp)
14045 __rcu_process_callbacks(rsp);
14046 - trace_rcu_utilization(TPS("End RCU core"));
14049 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14051 * Schedule RCU callback invocation. If the specified type of RCU
14052 * does not support RCU priority boosting, just do a direct call,
14053 @@ -3039,18 +3071,105 @@
14055 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
14057 - if (likely(!rsp->boost)) {
14058 - rcu_do_batch(rsp, rdp);
14059 + rcu_do_batch(rsp, rdp);
14062 +static void rcu_wake_cond(struct task_struct *t, int status)
14065 + * If the thread is yielding, only wake it when this
14066 + * is invoked from idle
14068 + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
14069 + wake_up_process(t);
14073 + * Wake up this CPU's rcuc kthread to do RCU core processing.
14075 +static void invoke_rcu_core(void)
14077 + unsigned long flags;
14078 + struct task_struct *t;
14080 + if (!cpu_online(smp_processor_id()))
14082 + local_irq_save(flags);
14083 + __this_cpu_write(rcu_cpu_has_work, 1);
14084 + t = __this_cpu_read(rcu_cpu_kthread_task);
14085 + if (t != NULL && current != t)
14086 + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
14087 + local_irq_restore(flags);
14090 +static void rcu_cpu_kthread_park(unsigned int cpu)
14092 + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14095 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
14097 + return __this_cpu_read(rcu_cpu_has_work);
14101 + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
14102 + * RCU softirq used in flavors and configurations of RCU that do not
14103 + * support RCU priority boosting.
14105 +static void rcu_cpu_kthread(unsigned int cpu)
14107 + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14108 + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14111 + for (spincnt = 0; spincnt < 10; spincnt++) {
14112 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14113 + local_bh_disable();
14114 + *statusp = RCU_KTHREAD_RUNNING;
14115 + this_cpu_inc(rcu_cpu_kthread_loops);
14116 + local_irq_disable();
14119 + local_irq_enable();
14121 + rcu_process_callbacks();
14122 + local_bh_enable();
14123 + if (*workp == 0) {
14124 + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14125 + *statusp = RCU_KTHREAD_WAITING;
14129 - invoke_rcu_callbacks_kthread();
14130 + *statusp = RCU_KTHREAD_YIELDING;
14131 + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14132 + schedule_timeout_interruptible(2);
14133 + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14134 + *statusp = RCU_KTHREAD_WAITING;
14137 -static void invoke_rcu_core(void)
14138 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14139 + .store = &rcu_cpu_kthread_task,
14140 + .thread_should_run = rcu_cpu_kthread_should_run,
14141 + .thread_fn = rcu_cpu_kthread,
14142 + .thread_comm = "rcuc/%u",
14143 + .setup = rcu_cpu_kthread_setup,
14144 + .park = rcu_cpu_kthread_park,
14148 + * Spawn per-CPU RCU core processing kthreads.
14150 +static int __init rcu_spawn_core_kthreads(void)
14152 - if (cpu_online(smp_processor_id()))
14153 - raise_softirq(RCU_SOFTIRQ);
14156 + for_each_possible_cpu(cpu)
14157 + per_cpu(rcu_cpu_has_work, cpu) = 0;
14158 + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14161 +early_initcall(rcu_spawn_core_kthreads);
14164 * Handle any core-RCU processing required by a call_rcu() invocation.
14165 @@ -3195,6 +3314,7 @@
14167 EXPORT_SYMBOL_GPL(call_rcu_sched);
14169 +#ifndef CONFIG_PREEMPT_RT_FULL
14171 * Queue an RCU callback for invocation after a quicker grace period.
14173 @@ -3203,6 +3323,7 @@
14174 __call_rcu(head, func, &rcu_bh_state, -1, 0);
14176 EXPORT_SYMBOL_GPL(call_rcu_bh);
14180 * Queue an RCU callback for lazy invocation after a grace period.
14181 @@ -3294,6 +3415,7 @@
14183 EXPORT_SYMBOL_GPL(synchronize_sched);
14185 +#ifndef CONFIG_PREEMPT_RT_FULL
14187 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
14189 @@ -3320,6 +3442,7 @@
14190 wait_rcu_gp(call_rcu_bh);
14192 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
14196 * get_state_synchronize_rcu - Snapshot current RCU state
14197 @@ -3698,6 +3821,7 @@
14198 mutex_unlock(&rsp->barrier_mutex);
14201 +#ifndef CONFIG_PREEMPT_RT_FULL
14203 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
14205 @@ -3706,6 +3830,7 @@
14206 _rcu_barrier(&rcu_bh_state);
14208 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
14212 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
14213 @@ -4227,12 +4352,13 @@
14215 rcu_bootup_announce();
14216 rcu_init_geometry();
14217 +#ifndef CONFIG_PREEMPT_RT_FULL
14218 rcu_init_one(&rcu_bh_state);
14220 rcu_init_one(&rcu_sched_state);
14222 rcu_dump_rcu_node_tree(&rcu_sched_state);
14223 __rcu_init_preempt();
14224 - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
14227 * We don't need protection against CPU-hotplug here because
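The rcuc/%u threads above ride on the generic smpboot infrastructure, which creates one kthread per CPU and parks/unparks them across hotplug automatically. Reduced to its skeleton, the registration pattern looks like this (all my_* names are illustrative):

	#include <linux/smpboot.h>

	static DEFINE_PER_CPU(struct task_struct *, my_task);
	static DEFINE_PER_CPU(int, my_has_work);

	static int my_should_run(unsigned int cpu)
	{
		return __this_cpu_read(my_has_work);
	}

	static void my_thread_fn(unsigned int cpu)
	{
		__this_cpu_write(my_has_work, 0);
		/* ... process this CPU's work, may sleep ... */
	}

	static struct smp_hotplug_thread my_thread_spec = {
		.store			= &my_task,
		.thread_should_run	= my_should_run,
		.thread_fn		= my_thread_fn,
		.thread_comm		= "mythread/%u",
	};

	static int __init my_spawn_threads(void)
	{
		return smpboot_register_percpu_thread(&my_thread_spec);
	}
	early_initcall(my_spawn_threads);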
14228 diff -Nur linux-4.9.6.orig/kernel/rcu/tree.h linux-4.9.6/kernel/rcu/tree.h
14229 --- linux-4.9.6.orig/kernel/rcu/tree.h 2017-01-26 08:25:24.000000000 +0100
14230 +++ linux-4.9.6/kernel/rcu/tree.h 2017-01-28 13:59:10.043660493 +0100
14231 @@ -588,18 +588,18 @@
14233 extern struct rcu_state rcu_sched_state;
14235 +#ifndef CONFIG_PREEMPT_RT_FULL
14236 extern struct rcu_state rcu_bh_state;
14239 #ifdef CONFIG_PREEMPT_RCU
14240 extern struct rcu_state rcu_preempt_state;
14241 #endif /* #ifdef CONFIG_PREEMPT_RCU */
14243 -#ifdef CONFIG_RCU_BOOST
14244 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14245 DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
14246 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14247 DECLARE_PER_CPU(char, rcu_cpu_has_work);
14248 -#endif /* #ifdef CONFIG_RCU_BOOST */
14250 #ifndef RCU_TREE_NONCORE
14252 @@ -619,10 +619,9 @@
14253 static void __init __rcu_init_preempt(void);
14254 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
14255 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
14256 -static void invoke_rcu_callbacks_kthread(void);
14257 static bool rcu_is_callbacks_kthread(void);
14258 +static void rcu_cpu_kthread_setup(unsigned int cpu);
14259 #ifdef CONFIG_RCU_BOOST
14260 -static void rcu_preempt_do_callbacks(void);
14261 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
14262 struct rcu_node *rnp);
14263 #endif /* #ifdef CONFIG_RCU_BOOST */
14264 diff -Nur linux-4.9.6.orig/kernel/rcu/tree_plugin.h linux-4.9.6/kernel/rcu/tree_plugin.h
14265 --- linux-4.9.6.orig/kernel/rcu/tree_plugin.h 2017-01-26 08:25:24.000000000 +0100
14266 +++ linux-4.9.6/kernel/rcu/tree_plugin.h 2017-01-28 13:59:10.043660493 +0100
14267 @@ -24,25 +24,10 @@
14268 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
14271 -#include <linux/delay.h>
14272 -#include <linux/gfp.h>
14273 -#include <linux/oom.h>
14274 -#include <linux/smpboot.h>
14275 -#include "../time/tick-internal.h"
14277 #ifdef CONFIG_RCU_BOOST
14279 #include "../locking/rtmutex_common.h"
14282 - * Control variables for per-CPU and per-rcu_node kthreads. These
14283 - * handle all flavors of RCU.
14285 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14286 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14287 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14288 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
14290 #else /* #ifdef CONFIG_RCU_BOOST */
14295 #endif /* #else #ifdef CONFIG_RCU_BOOST */
14298 + * Control variables for per-CPU and per-rcu_node kthreads. These
14299 + * handle all flavors of RCU.
14301 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
14302 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
14303 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
14305 #ifdef CONFIG_RCU_NOCB_CPU
14306 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
14307 static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
14308 @@ -426,7 +419,7 @@
14311 /* Hardware IRQ handlers cannot block, complain if they get here. */
14312 - if (in_irq() || in_serving_softirq()) {
14313 + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
14314 lockdep_rcu_suspicious(__FILE__, __LINE__,
14315 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
14316 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
14317 @@ -632,15 +625,6 @@
14318 t->rcu_read_unlock_special.b.need_qs = true;
14321 -#ifdef CONFIG_RCU_BOOST
14323 -static void rcu_preempt_do_callbacks(void)
14325 - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
14328 -#endif /* #ifdef CONFIG_RCU_BOOST */
14331 * Queue a preemptible-RCU callback for invocation after a grace period.
14333 @@ -829,6 +813,19 @@
14335 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
14338 + * If boosting, set rcuc kthreads to realtime priority.
14340 +static void rcu_cpu_kthread_setup(unsigned int cpu)
14342 +#ifdef CONFIG_RCU_BOOST
14343 + struct sched_param sp;
14345 + sp.sched_priority = kthread_prio;
14346 + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14347 +#endif /* #ifdef CONFIG_RCU_BOOST */
14350 #ifdef CONFIG_RCU_BOOST
14352 #include "../locking/rtmutex_common.h"
14353 @@ -860,16 +857,6 @@
14355 #endif /* #else #ifdef CONFIG_RCU_TRACE */
14357 -static void rcu_wake_cond(struct task_struct *t, int status)
14360 - * If the thread is yielding, only wake it when this
14361 - * is invoked from idle
14363 - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
14364 - wake_up_process(t);
14368 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
14369 * or ->boost_tasks, advancing the pointer to the next task in the
14370 @@ -1013,23 +1000,6 @@
14374 - * Wake up the per-CPU kthread to invoke RCU callbacks.
14376 -static void invoke_rcu_callbacks_kthread(void)
14378 - unsigned long flags;
14380 - local_irq_save(flags);
14381 - __this_cpu_write(rcu_cpu_has_work, 1);
14382 - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
14383 - current != __this_cpu_read(rcu_cpu_kthread_task)) {
14384 - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
14385 - __this_cpu_read(rcu_cpu_kthread_status));
14387 - local_irq_restore(flags);
14391 * Is the current CPU running the RCU-callbacks kthread?
14392 * Caller must have preemption disabled.
14394 @@ -1083,67 +1053,6 @@
14398 -static void rcu_kthread_do_work(void)
14400 - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
14401 - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
14402 - rcu_preempt_do_callbacks();
14405 -static void rcu_cpu_kthread_setup(unsigned int cpu)
14407 - struct sched_param sp;
14409 - sp.sched_priority = kthread_prio;
14410 - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
14413 -static void rcu_cpu_kthread_park(unsigned int cpu)
14415 - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14418 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
14420 - return __this_cpu_read(rcu_cpu_has_work);
14424 - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
14425 - * RCU softirq used in flavors and configurations of RCU that do not
14426 - * support RCU priority boosting.
14428 -static void rcu_cpu_kthread(unsigned int cpu)
14430 - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14431 - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14434 - for (spincnt = 0; spincnt < 10; spincnt++) {
14435 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14436 - local_bh_disable();
14437 - *statusp = RCU_KTHREAD_RUNNING;
14438 - this_cpu_inc(rcu_cpu_kthread_loops);
14439 - local_irq_disable();
14442 - local_irq_enable();
14444 - rcu_kthread_do_work();
14445 - local_bh_enable();
14446 - if (*workp == 0) {
14447 - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14448 - *statusp = RCU_KTHREAD_WAITING;
14452 - *statusp = RCU_KTHREAD_YIELDING;
14453 - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14454 - schedule_timeout_interruptible(2);
14455 - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14456 - *statusp = RCU_KTHREAD_WAITING;
14460 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
14461 * served by the rcu_node in question. The CPU hotplug lock is still
14462 @@ -1174,26 +1083,12 @@
14463 free_cpumask_var(cm);
14466 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14467 - .store = &rcu_cpu_kthread_task,
14468 - .thread_should_run = rcu_cpu_kthread_should_run,
14469 - .thread_fn = rcu_cpu_kthread,
14470 - .thread_comm = "rcuc/%u",
14471 - .setup = rcu_cpu_kthread_setup,
14472 - .park = rcu_cpu_kthread_park,
14476 * Spawn boost kthreads -- called as soon as the scheduler is running.
14478 static void __init rcu_spawn_boost_kthreads(void)
14480 struct rcu_node *rnp;
14483 - for_each_possible_cpu(cpu)
14484 - per_cpu(rcu_cpu_has_work, cpu) = 0;
14485 - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14486 rcu_for_each_leaf_node(rcu_state_p, rnp)
14487 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
14489 @@ -1216,11 +1111,6 @@
14490 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
14493 -static void invoke_rcu_callbacks_kthread(void)
14498 static bool rcu_is_callbacks_kthread(void)
14501 @@ -1244,7 +1134,7 @@
14503 #endif /* #else #ifdef CONFIG_RCU_BOOST */
14505 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
14506 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
14509 * Check to see if any future RCU-related work will need to be done
14510 @@ -1261,7 +1151,9 @@
14511 return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
14512 ? 0 : rcu_cpu_has_callbacks(NULL);
14514 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
14516 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
14518 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
14520 @@ -1357,6 +1249,8 @@
14524 +#ifndef CONFIG_PREEMPT_RT_FULL
14527 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
14528 * to invoke. If the CPU has callbacks, try to advance them. Tell the
14529 @@ -1402,6 +1296,7 @@
14530 *nextevt = basemono + dj * TICK_NSEC;
14533 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
14536 * Prepare a CPU for idle from an RCU perspective. The first major task
14537 diff -Nur linux-4.9.6.orig/kernel/rcu/update.c linux-4.9.6/kernel/rcu/update.c
14538 --- linux-4.9.6.orig/kernel/rcu/update.c 2017-01-26 08:25:24.000000000 +0100
14539 +++ linux-4.9.6/kernel/rcu/update.c 2017-01-28 13:59:10.043660493 +0100
14541 #ifndef CONFIG_TINY_RCU
14542 module_param(rcu_expedited, int, 0);
14543 module_param(rcu_normal, int, 0);
14544 -static int rcu_normal_after_boot;
14545 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
14546 module_param(rcu_normal_after_boot, int, 0);
14547 #endif /* #ifndef CONFIG_TINY_RCU */
14549 @@ -132,8 +132,7 @@
14551 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
14553 -static atomic_t rcu_expedited_nesting =
14554 - ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
14555 +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
14558 * Should normal grace-period primitives be expedited? Intended for
14559 @@ -182,8 +181,7 @@
14561 void rcu_end_inkernel_boot(void)
14563 - if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
14564 - rcu_unexpedite_gp();
14565 + rcu_unexpedite_gp();
14566 if (rcu_normal_after_boot)
14567 WRITE_ONCE(rcu_normal, 1);
14569 @@ -298,6 +296,7 @@
14571 EXPORT_SYMBOL_GPL(rcu_read_lock_held);
14573 +#ifndef CONFIG_PREEMPT_RT_FULL
14575 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
14577 @@ -324,6 +323,7 @@
14578 return in_softirq() || irqs_disabled();
14580 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
14583 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
14585 diff -Nur linux-4.9.6.orig/kernel/sched/completion.c linux-4.9.6/kernel/sched/completion.c
14586 --- linux-4.9.6.orig/kernel/sched/completion.c 2017-01-26 08:25:24.000000000 +0100
14587 +++ linux-4.9.6/kernel/sched/completion.c 2017-01-28 13:59:10.043660493 +0100
14588 @@ -30,10 +30,10 @@
14590 unsigned long flags;
14592 - spin_lock_irqsave(&x->wait.lock, flags);
14593 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14595 - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
14596 - spin_unlock_irqrestore(&x->wait.lock, flags);
14597 + swake_up_locked(&x->wait);
14598 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14600 EXPORT_SYMBOL(complete);
14602 @@ -50,10 +50,10 @@
14604 unsigned long flags;
14606 - spin_lock_irqsave(&x->wait.lock, flags);
14607 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14608 x->done += UINT_MAX/2;
14609 - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
14610 - spin_unlock_irqrestore(&x->wait.lock, flags);
14611 + swake_up_all_locked(&x->wait);
14612 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14614 EXPORT_SYMBOL(complete_all);
14616 @@ -62,20 +62,20 @@
14617 long (*action)(long), long timeout, int state)
14620 - DECLARE_WAITQUEUE(wait, current);
14621 + DECLARE_SWAITQUEUE(wait);
14623 - __add_wait_queue_tail_exclusive(&x->wait, &wait);
14624 + __prepare_to_swait(&x->wait, &wait);
14626 if (signal_pending_state(state, current)) {
14627 timeout = -ERESTARTSYS;
14630 __set_current_state(state);
14631 - spin_unlock_irq(&x->wait.lock);
14632 + raw_spin_unlock_irq(&x->wait.lock);
14633 timeout = action(timeout);
14634 - spin_lock_irq(&x->wait.lock);
14635 + raw_spin_lock_irq(&x->wait.lock);
14636 } while (!x->done && timeout);
14637 - __remove_wait_queue(&x->wait, &wait);
14638 + __finish_swait(&x->wait, &wait);
14646 - spin_lock_irq(&x->wait.lock);
14647 + raw_spin_lock_irq(&x->wait.lock);
14648 timeout = do_wait_for_common(x, action, timeout, state);
14649 - spin_unlock_irq(&x->wait.lock);
14650 + raw_spin_unlock_irq(&x->wait.lock);
14654 @@ -277,12 +277,12 @@
14655 if (!READ_ONCE(x->done))
14658 - spin_lock_irqsave(&x->wait.lock, flags);
14659 + raw_spin_lock_irqsave(&x->wait.lock, flags);
14664 - spin_unlock_irqrestore(&x->wait.lock, flags);
14665 + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
14668 EXPORT_SYMBOL(try_wait_for_completion);
14669 @@ -311,7 +311,7 @@
14670 * after it's acquired the lock.
14673 - spin_unlock_wait(&x->wait.lock);
14674 + raw_spin_unlock_wait(&x->wait.lock);
14677 EXPORT_SYMBOL(completion_done);
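Nothing changes for completion users: the swait conversion is internal. What it buys on RT is that complete() and complete_all() now take a raw (non-sleeping) lock and do bounded wakeups, so they stay callable from atomic context. Standard usage is unchanged:

	static DECLARE_COMPLETION(setup_done);

	static void producer(void)
	{
		/* ... finish initialization ... */
		complete(&setup_done);		/* raw lock, single wakeup */
	}

	static void consumer(void)
	{
		wait_for_completion(&setup_done); /* blocks on the swait queue */
	}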
14678 diff -Nur linux-4.9.6.orig/kernel/sched/core.c linux-4.9.6/kernel/sched/core.c
14679 --- linux-4.9.6.orig/kernel/sched/core.c 2017-01-26 08:25:24.000000000 +0100
14680 +++ linux-4.9.6/kernel/sched/core.c 2017-01-28 13:59:10.043660493 +0100
14681 @@ -129,7 +129,11 @@
14682 * Number of tasks to iterate in a single balance run.
14683 * Limited because this is done with IRQs disabled.
14685 +#ifndef CONFIG_PREEMPT_RT_FULL
14686 const_debug unsigned int sysctl_sched_nr_migrate = 32;
14688 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
14692 * period over which we average the RT time consumption, measured
14693 @@ -345,6 +349,7 @@
14695 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
14696 rq->hrtick_timer.function = hrtick;
14697 + rq->hrtick_timer.irqsafe = 1;
14699 #else /* CONFIG_SCHED_HRTICK */
14700 static inline void hrtick_clear(struct rq *rq)
14701 @@ -449,7 +454,7 @@
14702 head->lastp = &node->next;
14705 -void wake_up_q(struct wake_q_head *head)
14706 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
14708 struct wake_q_node *node = head->first;
14710 @@ -466,7 +471,10 @@
14711 * wake_up_process() implies a wmb() to pair with the queueing
14712 * in wake_q_add() so as not to miss wakeups.
14714 - wake_up_process(task);
14716 + wake_up_lock_sleeper(task);
14718 + wake_up_process(task);
14719 put_task_struct(task);
14722 @@ -502,6 +510,38 @@
14723 trace_sched_wake_idle_without_ipi(cpu);
14726 +#ifdef CONFIG_PREEMPT_LAZY
14727 +void resched_curr_lazy(struct rq *rq)
14729 + struct task_struct *curr = rq->curr;
14732 + if (!sched_feat(PREEMPT_LAZY)) {
14733 + resched_curr(rq);
14737 + lockdep_assert_held(&rq->lock);
14739 + if (test_tsk_need_resched(curr))
14742 + if (test_tsk_need_resched_lazy(curr))
14745 + set_tsk_need_resched_lazy(curr);
14747 + cpu = cpu_of(rq);
14748 + if (cpu == smp_processor_id())
14751 + /* NEED_RESCHED_LAZY must be visible before we test polling */
14753 + if (!tsk_is_polling(curr))
14754 + smp_send_reschedule(cpu);
14758 void resched_cpu(int cpu)
14760 struct rq *rq = cpu_rq(cpu);
14761 @@ -525,11 +565,14 @@
14763 int get_nohz_timer_target(void)
14765 - int i, cpu = smp_processor_id();
14767 struct sched_domain *sd;
14769 + preempt_disable_rt();
14770 + cpu = smp_processor_id();
14772 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
14774 + goto preempt_en_rt;
14777 for_each_domain(cpu, sd) {
14778 @@ -548,6 +591,8 @@
14779 cpu = housekeeping_any_cpu();
14783 + preempt_enable_rt();
14787 @@ -1100,6 +1145,11 @@
14789 lockdep_assert_held(&p->pi_lock);
14791 + if (__migrate_disabled(p)) {
14792 + cpumask_copy(&p->cpus_allowed, new_mask);
14796 queued = task_on_rq_queued(p);
14797 running = task_current(rq, p);
14799 @@ -1122,6 +1172,84 @@
14800 set_curr_task(rq, p);
14803 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
14804 +static DEFINE_MUTEX(sched_down_mutex);
14805 +static cpumask_t sched_down_cpumask;
14807 +void tell_sched_cpu_down_begin(int cpu)
14809 + mutex_lock(&sched_down_mutex);
14810 + cpumask_set_cpu(cpu, &sched_down_cpumask);
14811 + mutex_unlock(&sched_down_mutex);
14814 +void tell_sched_cpu_down_done(int cpu)
14816 + mutex_lock(&sched_down_mutex);
14817 + cpumask_clear_cpu(cpu, &sched_down_cpumask);
14818 + mutex_unlock(&sched_down_mutex);
14822 + * migrate_me - try to move the current task off this cpu
14824 + * Used by the pin_current_cpu() code to try to get tasks
14825 + * to move off the current CPU as it is going down.
14826 + * It will only move the task if the task isn't pinned to
14827 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
14828 + * and the task has to be in the RUNNING state. Otherwise the
14829 + * movement of the task would wake it up (change its state
14830 + * to running) when the task does not expect it.
14832 + * Returns 1 if it succeeded in moving the current task
14835 +int migrate_me(void)
14837 + struct task_struct *p = current;
14838 + struct migration_arg arg;
14839 + struct cpumask *cpumask;
14840 + struct cpumask *mask;
14841 + unsigned int dest_cpu;
14842 + struct rq_flags rf;
14846 + * We cannot migrate tasks bound to a CPU or tasks that are
14847 + * not running. The movement of the task would wake it up.
14849 + if (p->flags & PF_NO_SETAFFINITY || p->state)
14852 + mutex_lock(&sched_down_mutex);
14853 + rq = task_rq_lock(p, &rf);
14855 + cpumask = this_cpu_ptr(&sched_cpumasks);
14856 + mask = &p->cpus_allowed;
14858 + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
14860 + if (!cpumask_weight(cpumask)) {
14861 + /* It's only on this CPU? */
14862 + task_rq_unlock(rq, p, &rf);
14863 + mutex_unlock(&sched_down_mutex);
14867 + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
14870 + arg.dest_cpu = dest_cpu;
14872 + task_rq_unlock(rq, p, &rf);
14874 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
14875 + tlb_migrate_finish(p->mm);
14876 + mutex_unlock(&sched_down_mutex);
14882 * Change a given task's CPU affinity. Migrate the thread to a
14883 * proper CPU and schedule it away if the CPU it's executing on
14884 @@ -1179,7 +1307,7 @@
14887 /* Can the task run on the task's current CPU? If so, we're done */
14888 - if (cpumask_test_cpu(task_cpu(p), new_mask))
14889 + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
14892 dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
14893 @@ -1366,6 +1494,18 @@
14897 +static bool check_task_state(struct task_struct *p, long match_state)
14899 + bool match = false;
14901 + raw_spin_lock_irq(&p->pi_lock);
14902 + if (p->state == match_state || p->saved_state == match_state)
14904 + raw_spin_unlock_irq(&p->pi_lock);
14910 * wait_task_inactive - wait for a thread to unschedule.
14912 @@ -1410,7 +1550,7 @@
14913 * is actually now running somewhere else!
14915 while (task_running(rq, p)) {
14916 - if (match_state && unlikely(p->state != match_state))
14917 + if (match_state && !check_task_state(p, match_state))
14921 @@ -1425,7 +1565,8 @@
14922 running = task_running(rq, p);
14923 queued = task_on_rq_queued(p);
14925 - if (!match_state || p->state == match_state)
14926 + if (!match_state || p->state == match_state ||
14927 + p->saved_state == match_state)
14928 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
14929 task_rq_unlock(rq, p, &rf);
14931 @@ -1680,10 +1821,6 @@
14933 activate_task(rq, p, en_flags);
14934 p->on_rq = TASK_ON_RQ_QUEUED;
14936 - /* if a worker is waking up, notify workqueue */
14937 - if (p->flags & PF_WQ_WORKER)
14938 - wq_worker_waking_up(p, cpu_of(rq));
14942 @@ -2018,8 +2155,27 @@
14944 smp_mb__before_spinlock();
14945 raw_spin_lock_irqsave(&p->pi_lock, flags);
14946 - if (!(p->state & state))
14947 + if (!(p->state & state)) {
14949 + * The task might be running due to a spinlock sleeper
14950 + * wakeup. Check the saved state and set it to running
14951 + * if the wakeup condition is true.
14953 + if (!(wake_flags & WF_LOCK_SLEEPER)) {
14954 + if (p->saved_state & state) {
14955 + p->saved_state = TASK_RUNNING;
14963 + * If this is a regular wakeup, then we can unconditionally
14964 + * clear the saved state of a "lock sleeper".
14966 + if (!(wake_flags & WF_LOCK_SLEEPER))
14967 + p->saved_state = TASK_RUNNING;
14969 trace_sched_waking(p);
14971 @@ -2102,53 +2258,6 @@
14975 - * try_to_wake_up_local - try to wake up a local task with rq lock held
14976 - * @p: the thread to be awakened
14977 - * @cookie: context's cookie for pinning
14979 - * Put @p on the run-queue if it's not already there. The caller must
14980 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
14981 - * the current task.
14983 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
14985 - struct rq *rq = task_rq(p);
14987 - if (WARN_ON_ONCE(rq != this_rq()) ||
14988 - WARN_ON_ONCE(p == current))
14991 - lockdep_assert_held(&rq->lock);
14993 - if (!raw_spin_trylock(&p->pi_lock)) {
14995 - * This is OK, because current is on_cpu, which avoids it being
14996 - * picked for load-balance and preemption/IRQs are still
14997 - * disabled avoiding further scheduler activity on it and we've
14998 - * not yet picked a replacement task.
15000 - lockdep_unpin_lock(&rq->lock, cookie);
15001 - raw_spin_unlock(&rq->lock);
15002 - raw_spin_lock(&p->pi_lock);
15003 - raw_spin_lock(&rq->lock);
15004 - lockdep_repin_lock(&rq->lock, cookie);
15007 - if (!(p->state & TASK_NORMAL))
15010 - trace_sched_waking(p);
15012 - if (!task_on_rq_queued(p))
15013 - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
15015 - ttwu_do_wakeup(rq, p, 0, cookie);
15016 - ttwu_stat(p, smp_processor_id(), 0);
15018 - raw_spin_unlock(&p->pi_lock);
15022 * wake_up_process - Wake up a specific process
15023 * @p: The process to be woken up.
15025 @@ -2166,6 +2275,18 @@
15027 EXPORT_SYMBOL(wake_up_process);
15030 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
15031 + * @p: The process to be woken up.
15033 + * Same as wake_up_process() above, but passes wake_flags=WF_LOCK_SLEEPER to indicate
15034 + * the nature of the wakeup.
15036 +int wake_up_lock_sleeper(struct task_struct *p)
15038 + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
15041 int wake_up_state(struct task_struct *p, unsigned int state)
15043 return try_to_wake_up(p, state, 0);
15044 @@ -2442,6 +2563,9 @@
15047 init_task_preempt_count(p);
15048 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15049 + task_thread_info(p)->preempt_lazy_count = 0;
15052 plist_node_init(&p->pushable_tasks, MAX_PRIO);
15053 RB_CLEAR_NODE(&p->pushable_dl_tasks);
15054 @@ -2770,21 +2894,16 @@
15055 finish_arch_post_lock_switch();
15057 fire_sched_in_preempt_notifiers(current);
15059 + * We use mmdrop_delayed() here so we don't have to do the
15060 + * full __mmdrop() when we are the last user.
15064 + mmdrop_delayed(mm);
15065 if (unlikely(prev_state == TASK_DEAD)) {
15066 if (prev->sched_class->task_dead)
15067 prev->sched_class->task_dead(prev);
15070 - * Remove function-return probe instances associated with this
15071 - * task and put them back on the free list.
15073 - kprobe_flush_task(prev);
15075 - /* Task is done with its stack. */
15076 - put_task_stack(prev);
15078 put_task_struct(prev);
15081 @@ -3252,6 +3371,77 @@
15082 schedstat_inc(this_rq()->sched_count);
15085 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
15087 +void migrate_disable(void)
15089 + struct task_struct *p = current;
15091 + if (in_atomic() || irqs_disabled()) {
15092 +#ifdef CONFIG_SCHED_DEBUG
15093 + p->migrate_disable_atomic++;
15098 +#ifdef CONFIG_SCHED_DEBUG
15099 + if (unlikely(p->migrate_disable_atomic)) {
15105 + if (p->migrate_disable) {
15106 + p->migrate_disable++;
15110 + preempt_disable();
15111 + preempt_lazy_disable();
15112 + pin_current_cpu();
15113 + p->migrate_disable = 1;
15114 + preempt_enable();
15116 +EXPORT_SYMBOL(migrate_disable);
15118 +void migrate_enable(void)
15120 + struct task_struct *p = current;
15122 + if (in_atomic() || irqs_disabled()) {
15123 +#ifdef CONFIG_SCHED_DEBUG
15124 + p->migrate_disable_atomic--;
15129 +#ifdef CONFIG_SCHED_DEBUG
15130 + if (unlikely(p->migrate_disable_atomic)) {
15135 + WARN_ON_ONCE(p->migrate_disable <= 0);
15137 + if (p->migrate_disable > 1) {
15138 + p->migrate_disable--;
15142 + preempt_disable();
15144 + * Clearing migrate_disable causes tsk_cpus_allowed to
15145 + * show the task's original cpu affinity.
15147 + p->migrate_disable = 0;
15149 + unpin_current_cpu();
15150 + preempt_enable();
15151 + preempt_lazy_enable();
15153 +EXPORT_SYMBOL(migrate_enable);
15157 * Pick up the highest-prio task:
15159 @@ -3368,19 +3558,6 @@
15161 deactivate_task(rq, prev, DEQUEUE_SLEEP);
15165 - * If a worker went to sleep, notify and ask workqueue
15166 - * whether it wants to wake up a task to maintain
15169 - if (prev->flags & PF_WQ_WORKER) {
15170 - struct task_struct *to_wakeup;
15172 - to_wakeup = wq_worker_sleeping(prev);
15174 - try_to_wake_up_local(to_wakeup, cookie);
15177 switch_count = &prev->nvcsw;
15179 @@ -3390,6 +3567,7 @@
15181 next = pick_next_task(rq, prev, cookie);
15182 clear_tsk_need_resched(prev);
15183 + clear_tsk_need_resched_lazy(prev);
15184 clear_preempt_need_resched();
15185 rq->clock_skip_update = 0;
15187 @@ -3437,9 +3615,20 @@
15189 static inline void sched_submit_work(struct task_struct *tsk)
15191 - if (!tsk->state || tsk_is_pi_blocked(tsk))
15195 + * If a worker went to sleep, notify and ask workqueue whether
15196 + * it wants to wake up a task to maintain concurrency.
15198 + if (tsk->flags & PF_WQ_WORKER)
15199 + wq_worker_sleeping(tsk);
15202 + if (tsk_is_pi_blocked(tsk))
15206 * If we are going to sleep and we have plugged IO queued,
15207 * make sure to submit it to avoid deadlocks.
15209 @@ -3447,6 +3636,12 @@
15210 blk_schedule_flush_plug(tsk);
15213 +static void sched_update_worker(struct task_struct *tsk)
15215 + if (tsk->flags & PF_WQ_WORKER)
15216 + wq_worker_running(tsk);
15219 asmlinkage __visible void __sched schedule(void)
15221 struct task_struct *tsk = current;
15222 @@ -3457,6 +3652,7 @@
15224 sched_preempt_enable_no_resched();
15225 } while (need_resched());
15226 + sched_update_worker(tsk);
15228 EXPORT_SYMBOL(schedule);
15230 @@ -3520,6 +3716,30 @@
15231 } while (need_resched());
15234 +#ifdef CONFIG_PREEMPT_LAZY
15236 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since
15237 + * it is set by an RT task. Otherwise we try to avoid being scheduled out
15238 + * as long as the preempt_lazy_count counter is > 0.
15240 +static __always_inline int preemptible_lazy(void)
15242 + if (test_thread_flag(TIF_NEED_RESCHED))
15244 + if (current_thread_info()->preempt_lazy_count)
15251 +static inline int preemptible_lazy(void)
15258 #ifdef CONFIG_PREEMPT
15260 * this is the entry point to schedule() from in-kernel preemption
15261 @@ -3534,7 +3754,8 @@
15263 if (likely(!preemptible()))
15266 + if (!preemptible_lazy())
15268 preempt_schedule_common();
15270 NOKPROBE_SYMBOL(preempt_schedule);
15271 @@ -3561,6 +3782,9 @@
15272 if (likely(!preemptible()))
15275 + if (!preemptible_lazy())
15280 * Because the function tracer can trace preempt_count_sub()
15281 @@ -3583,7 +3807,16 @@
15282 * an infinite recursion.
15284 prev_ctx = exception_enter();
15286 + * The add/subtract must not be traced by the function
15287 + * tracer. But we still want to account for the
15288 + * preempt off latency tracer. Since the _notrace versions
15289 + * of add/subtract skip the accounting for latency tracer
15290 + * we must force it manually.
15292 + start_critical_timings();
15294 + stop_critical_timings();
15295 exception_exit(prev_ctx);
15297 preempt_latency_stop(1);
15298 @@ -4939,6 +5172,7 @@
15300 EXPORT_SYMBOL(__cond_resched_lock);
15302 +#ifndef CONFIG_PREEMPT_RT_FULL
15303 int __sched __cond_resched_softirq(void)
15305 BUG_ON(!in_softirq());
15306 @@ -4952,6 +5186,7 @@
15309 EXPORT_SYMBOL(__cond_resched_softirq);
15313 * yield - yield the current processor to other threads.
15314 @@ -5315,7 +5550,9 @@
15316 /* Set the preempt count _outside_ the spinlocks! */
15317 init_idle_preempt_count(idle, cpu);
15319 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15320 + task_thread_info(idle)->preempt_lazy_count = 0;
15323 * The idle tasks have their own, simple scheduling class:
15325 @@ -5458,6 +5695,8 @@
15326 #endif /* CONFIG_NUMA_BALANCING */
15328 #ifdef CONFIG_HOTPLUG_CPU
15329 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
15332 * Ensures that the idle task is using init_mm right before its cpu goes
15334 @@ -5472,7 +5711,12 @@
15335 switch_mm_irqs_off(mm, &init_mm, current);
15336 finish_arch_post_lock_switch();
15340 + * Defer the cleanup to a live CPU. On RT we can neither
15341 + * call mmdrop() nor mmdrop_delayed() from here.
15343 + per_cpu(idle_last_mm, smp_processor_id()) = mm;
15348 @@ -7418,6 +7662,10 @@
15349 update_max_interval();
15350 nohz_balance_exit_idle(cpu);
15352 + if (per_cpu(idle_last_mm, cpu)) {
15353 + mmdrop_delayed(per_cpu(idle_last_mm, cpu));
15354 + per_cpu(idle_last_mm, cpu) = NULL;
15359 @@ -7698,7 +7946,7 @@
15360 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
15361 static inline int preempt_count_equals(int preempt_offset)
15363 - int nested = preempt_count() + rcu_preempt_depth();
15364 + int nested = preempt_count() + sched_rcu_preempt_depth();
15366 return (nested == preempt_offset);
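migrate_disable()/migrate_enable() are the RT substitute for the common preempt_disable() pattern around per-CPU data: the task is pinned to its CPU but stays preemptible and may block on RT's sleeping spinlocks. A minimal sketch (the per-CPU structure is illustrative):

	struct my_stats {
		unsigned long events;
	};
	static DEFINE_PER_CPU(struct my_stats, my_stats);

	static void count_event(void)
	{
		struct my_stats *s;

		migrate_disable();	/* pinned to this CPU, still preemptible */
		s = this_cpu_ptr(&my_stats);
		s->events++;		/* sleeping locks are fine in here on RT */
		migrate_enable();
	}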
15368 diff -Nur linux-4.9.6.orig/kernel/sched/deadline.c linux-4.9.6/kernel/sched/deadline.c
15369 --- linux-4.9.6.orig/kernel/sched/deadline.c 2017-01-26 08:25:24.000000000 +0100
15370 +++ linux-4.9.6/kernel/sched/deadline.c 2017-01-28 13:59:10.043660493 +0100
15371 @@ -687,6 +687,7 @@
15373 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15374 timer->function = dl_task_timer;
15375 + timer->irqsafe = 1;
15379 diff -Nur linux-4.9.6.orig/kernel/sched/debug.c linux-4.9.6/kernel/sched/debug.c
15380 --- linux-4.9.6.orig/kernel/sched/debug.c 2017-01-26 08:25:24.000000000 +0100
15381 +++ linux-4.9.6/kernel/sched/debug.c 2017-01-28 13:59:10.043660493 +0100
15382 @@ -558,6 +558,9 @@
15387 + P(rt_nr_migratory);
15392 @@ -953,6 +956,10 @@
15396 +#ifdef CONFIG_PREEMPT_RT_FULL
15397 + P(migrate_disable);
15399 + P(nr_cpus_allowed);
15400 #undef PN_SCHEDSTAT
15403 diff -Nur linux-4.9.6.orig/kernel/sched/fair.c linux-4.9.6/kernel/sched/fair.c
15404 --- linux-4.9.6.orig/kernel/sched/fair.c 2017-01-26 08:25:24.000000000 +0100
15405 +++ linux-4.9.6/kernel/sched/fair.c 2017-01-28 13:59:10.047660646 +0100
15406 @@ -3518,7 +3518,7 @@
15407 ideal_runtime = sched_slice(cfs_rq, curr);
15408 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
15409 if (delta_exec > ideal_runtime) {
15410 - resched_curr(rq_of(cfs_rq));
15411 + resched_curr_lazy(rq_of(cfs_rq));
15413 * The current task ran long enough, ensure it doesn't get
15414 * re-elected due to buddy favours.
15415 @@ -3542,7 +3542,7 @@
15418 if (delta > ideal_runtime)
15419 - resched_curr(rq_of(cfs_rq));
15420 + resched_curr_lazy(rq_of(cfs_rq));
15424 @@ -3684,7 +3684,7 @@
15425 * validating it and just reschedule.
15428 - resched_curr(rq_of(cfs_rq));
15429 + resched_curr_lazy(rq_of(cfs_rq));
15433 @@ -3866,7 +3866,7 @@
15434 * hierarchy can be throttled
15436 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
15437 - resched_curr(rq_of(cfs_rq));
15438 + resched_curr_lazy(rq_of(cfs_rq));
15441 static __always_inline
15442 @@ -4494,7 +4494,7 @@
15446 - resched_curr(rq);
15447 + resched_curr_lazy(rq);
15450 hrtick_start(rq, delta);
15451 @@ -5905,7 +5905,7 @@
15455 - resched_curr(rq);
15456 + resched_curr_lazy(rq);
15458 * Only set the backward buddy when the current task is still
15459 * on the rq. This can happen when a wakeup gets interleaved
15460 @@ -8631,7 +8631,7 @@
15461 * 'current' within the tree based on its new key value.
15463 swap(curr->vruntime, se->vruntime);
15464 - resched_curr(rq);
15465 + resched_curr_lazy(rq);
15468 se->vruntime -= cfs_rq->min_vruntime;
15469 @@ -8655,7 +8655,7 @@
15471 if (rq->curr == p) {
15472 if (p->prio > oldprio)
15473 - resched_curr(rq);
15474 + resched_curr_lazy(rq);
15476 check_preempt_curr(rq, p, 0);
15478 diff -Nur linux-4.9.6.orig/kernel/sched/features.h linux-4.9.6/kernel/sched/features.h
15479 --- linux-4.9.6.orig/kernel/sched/features.h 2017-01-26 08:25:24.000000000 +0100
15480 +++ linux-4.9.6/kernel/sched/features.h 2017-01-28 13:59:10.047660646 +0100
15481 @@ -45,11 +45,19 @@
15483 SCHED_FEAT(NONTASK_CAPACITY, true)
15485 +#ifdef CONFIG_PREEMPT_RT_FULL
15486 +SCHED_FEAT(TTWU_QUEUE, false)
15487 +# ifdef CONFIG_PREEMPT_LAZY
15488 +SCHED_FEAT(PREEMPT_LAZY, true)
15493 * Queue remote wakeups on the target CPU and process them
15494 * using the scheduler IPI. Reduces rq->lock contention/bounces.
15496 SCHED_FEAT(TTWU_QUEUE, true)
15499 #ifdef HAVE_RT_PUSH_IPI
15501 diff -Nur linux-4.9.6.orig/kernel/sched/Makefile linux-4.9.6/kernel/sched/Makefile
15502 --- linux-4.9.6.orig/kernel/sched/Makefile 2017-01-26 08:25:24.000000000 +0100
15503 +++ linux-4.9.6/kernel/sched/Makefile 2017-01-28 13:59:10.043660493 +0100
15506 obj-y += core.o loadavg.o clock.o cputime.o
15507 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15508 -obj-y += wait.o swait.o completion.o idle.o
15509 +obj-y += wait.o swait.o swork.o completion.o idle.o
15510 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
15511 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
15512 obj-$(CONFIG_SCHEDSTATS) += stats.o
15513 diff -Nur linux-4.9.6.orig/kernel/sched/rt.c linux-4.9.6/kernel/sched/rt.c
15514 --- linux-4.9.6.orig/kernel/sched/rt.c 2017-01-26 08:25:24.000000000 +0100
15515 +++ linux-4.9.6/kernel/sched/rt.c 2017-01-28 13:59:10.047660646 +0100
15518 hrtimer_init(&rt_b->rt_period_timer,
15519 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15520 + rt_b->rt_period_timer.irqsafe = 1;
15521 rt_b->rt_period_timer.function = sched_rt_period_timer;
15524 @@ -101,6 +102,7 @@
15525 rt_rq->push_cpu = nr_cpu_ids;
15526 raw_spin_lock_init(&rt_rq->push_lock);
15527 init_irq_work(&rt_rq->push_work, push_irq_work_func);
15528 + rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
15530 #endif /* CONFIG_SMP */
15531 /* We start is dequeued state, because no RT tasks are queued */
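On RT, hrtimers normally expire in the timer softirq thread; the irqsafe field added by this patch marks timers whose handlers must keep firing in hard interrupt context, as the scheduler's bandwidth timers do here. The setup pattern, assuming the RT-only ->irqsafe member:

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* runs in hard IRQ context on RT: no sleeping locks here */
		return HRTIMER_NORESTART;
	}

	static void my_timer_setup(void)
	{
		hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		my_timer.irqsafe = 1;	/* RT: expire in hard IRQ, not softirq */
		my_timer.function = my_timer_fn;
	}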
15532 diff -Nur linux-4.9.6.orig/kernel/sched/sched.h linux-4.9.6/kernel/sched/sched.h
15533 --- linux-4.9.6.orig/kernel/sched/sched.h 2017-01-26 08:25:24.000000000 +0100
15534 +++ linux-4.9.6/kernel/sched/sched.h 2017-01-28 13:59:10.047660646 +0100
15535 @@ -1163,6 +1163,7 @@
15536 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
15537 #define WF_FORK 0x02 /* child wakeup after fork */
15538 #define WF_MIGRATED 0x4 /* internal use, task got migrated */
15539 +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
15542 * To aid in avoiding the subversion of "niceness" due to uneven distribution
15543 @@ -1346,6 +1347,15 @@
15544 extern void resched_curr(struct rq *rq);
15545 extern void resched_cpu(int cpu);
15547 +#ifdef CONFIG_PREEMPT_LAZY
15548 +extern void resched_curr_lazy(struct rq *rq);
15550 +static inline void resched_curr_lazy(struct rq *rq)
15552 + resched_curr(rq);
15556 extern struct rt_bandwidth def_rt_bandwidth;
15557 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
15559 diff -Nur linux-4.9.6.orig/kernel/sched/swait.c linux-4.9.6/kernel/sched/swait.c
15560 --- linux-4.9.6.orig/kernel/sched/swait.c 2017-01-26 08:25:24.000000000 +0100
15561 +++ linux-4.9.6/kernel/sched/swait.c 2017-01-28 13:59:10.047660646 +0100
15563 #include <linux/sched.h>
15564 #include <linux/swait.h>
15565 +#include <linux/suspend.h>
15567 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
15568 struct lock_class_key *key)
15571 EXPORT_SYMBOL(swake_up_locked);
15573 +void swake_up_all_locked(struct swait_queue_head *q)
15575 + struct swait_queue *curr;
15578 + while (!list_empty(&q->task_list)) {
15580 + curr = list_first_entry(&q->task_list, typeof(*curr),
15582 + wake_up_process(curr->task);
15583 + list_del_init(&curr->task_list);
15586 + if (pm_in_action)
15588 + WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
15590 +EXPORT_SYMBOL(swake_up_all_locked);
15592 void swake_up(struct swait_queue_head *q)
15594 unsigned long flags;
15596 if (!swait_active(q))
15599 + WARN_ON(irqs_disabled());
15600 raw_spin_lock_irq(&q->lock);
15601 list_splice_init(&q->task_list, &tmp);
15602 while (!list_empty(&tmp)) {
15603 diff -Nur linux-4.9.6.orig/kernel/sched/swork.c linux-4.9.6/kernel/sched/swork.c
15604 --- linux-4.9.6.orig/kernel/sched/swork.c 1970-01-01 01:00:00.000000000 +0100
15605 +++ linux-4.9.6/kernel/sched/swork.c 2017-01-28 13:59:10.047660646 +0100
15608 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
15610 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks
15611 + * from irq context. The callbacks are executed in kthread context.
15614 +#include <linux/swait.h>
15615 +#include <linux/swork.h>
15616 +#include <linux/kthread.h>
15617 +#include <linux/slab.h>
15618 +#include <linux/spinlock.h>
15619 +#include <linux/export.h>
15621 +#define SWORK_EVENT_PENDING (1 << 0)
15623 +static DEFINE_MUTEX(worker_mutex);
15624 +static struct sworker *glob_worker;
15627 + struct list_head events;
15628 + struct swait_queue_head wq;
15630 + raw_spinlock_t lock;
15632 + struct task_struct *task;
15636 +static bool swork_readable(struct sworker *worker)
15640 + if (kthread_should_stop())
15643 + raw_spin_lock_irq(&worker->lock);
15644 + r = !list_empty(&worker->events);
15645 + raw_spin_unlock_irq(&worker->lock);
15650 +static int swork_kthread(void *arg)
15652 + struct sworker *worker = arg;
15655 + swait_event_interruptible(worker->wq,
15656 + swork_readable(worker));
15657 + if (kthread_should_stop())
15660 + raw_spin_lock_irq(&worker->lock);
15661 + while (!list_empty(&worker->events)) {
15662 + struct swork_event *sev;
15664 + sev = list_first_entry(&worker->events,
15665 + struct swork_event, item);
15666 + list_del(&sev->item);
15667 + raw_spin_unlock_irq(&worker->lock);
15669 + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
15672 + raw_spin_lock_irq(&worker->lock);
15674 + raw_spin_unlock_irq(&worker->lock);
15679 +static struct sworker *swork_create(void)
15681 + struct sworker *worker;
15683 + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
15685 + return ERR_PTR(-ENOMEM);
15687 + INIT_LIST_HEAD(&worker->events);
15688 + raw_spin_lock_init(&worker->lock);
15689 + init_swait_queue_head(&worker->wq);
15691 + worker->task = kthread_run(swork_kthread, worker, "kswork");
15692 + if (IS_ERR(worker->task)) {
15694 + return ERR_PTR(-ENOMEM);
15700 +static void swork_destroy(struct sworker *worker)
15702 + kthread_stop(worker->task);
15704 + WARN_ON(!list_empty(&worker->events));
15709 + * swork_queue - queue swork
15711 + * Returns %false if @sev was already on a queue, %true otherwise.
15713 + * The work is queued and processed on a random CPU
15715 +bool swork_queue(struct swork_event *sev)
15717 + unsigned long flags;
15719 + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
15722 + raw_spin_lock_irqsave(&glob_worker->lock, flags);
15723 + list_add_tail(&sev->item, &glob_worker->events);
15724 + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
15726 + swake_up(&glob_worker->wq);
15729 +EXPORT_SYMBOL_GPL(swork_queue);
15732 + * swork_get - get an instance of the sworker
15734 + * Returns a negative error code if the initialization of the worker
15735 + * failed, %0 otherwise.
15738 +int swork_get(void)
15740 + struct sworker *worker;
15742 + mutex_lock(&worker_mutex);
15743 + if (!glob_worker) {
15744 + worker = swork_create();
15745 + if (IS_ERR(worker)) {
15746 + mutex_unlock(&worker_mutex);
15750 + glob_worker = worker;
15753 + glob_worker->refs++;
15754 + mutex_unlock(&worker_mutex);
15758 +EXPORT_SYMBOL_GPL(swork_get);
15761 + * swork_put - puts an instance of the sworker
15763 + * Will destroy the sworker thread. This function must not be called until all
15764 + * queued events have been completed.
15766 +void swork_put(void)
15768 + mutex_lock(&worker_mutex);
15770 + glob_worker->refs--;
15771 + if (glob_worker->refs > 0)
15774 + swork_destroy(glob_worker);
15775 + glob_worker = NULL;
15777 + mutex_unlock(&worker_mutex);
15779 +EXPORT_SYMBOL_GPL(swork_put);
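Typical swork usage: take a reference once at init (which spawns the global kswork thread on first use), queue events from hard IRQ context, and drop the reference on teardown after all events have run. This sketch assumes the INIT_SWORK() helper and the ->func callback member from the accompanying linux/swork.h header, which this hunk does not show:

	#include <linux/interrupt.h>
	#include <linux/swork.h>

	static struct swork_event my_event;

	static void my_event_fn(struct swork_event *sev)
	{
		/* runs in the kswork kthread: sleeping is allowed here */
	}

	static int __init my_driver_init(void)
	{
		int err = swork_get();	/* create or ref the kswork thread */

		if (err)
			return err;
		INIT_SWORK(&my_event, my_event_fn);
		return 0;
	}

	static irqreturn_t my_irq_handler(int irq, void *dev_id)
	{
		swork_queue(&my_event);	/* safe from hard IRQ context */
		return IRQ_HANDLED;
	}

	static void my_driver_exit(void)
	{
		swork_put();	/* only after all queued events completed */
	}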
15780 diff -Nur linux-4.9.6.orig/kernel/signal.c linux-4.9.6/kernel/signal.c
15781 --- linux-4.9.6.orig/kernel/signal.c 2017-01-26 08:25:24.000000000 +0100
15782 +++ linux-4.9.6/kernel/signal.c 2017-01-28 13:59:10.047660646 +0100
15784 #include <linux/export.h>
15785 #include <linux/init.h>
15786 #include <linux/sched.h>
15787 +#include <linux/sched/rt.h>
15788 #include <linux/fs.h>
15789 #include <linux/tty.h>
15790 #include <linux/binfmts.h>
15791 @@ -352,13 +353,30 @@
15795 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
15797 + struct sigqueue *q = t->sigqueue_cache;
15799 + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
15804 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
15806 + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
15812 * allocate a new signal queue record
15813 * - this may be called without locks if and only if t == current, otherwise an
15814 * appropriate lock must be held to stop the target task from exiting
15816 static struct sigqueue *
15817 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
15818 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
15819 + int override_rlimit, int fromslab)
15821 struct sigqueue *q = NULL;
15822 struct user_struct *user;
15823 @@ -375,7 +393,10 @@
15824 if (override_rlimit ||
15825 atomic_read(&user->sigpending) <=
15826 task_rlimit(t, RLIMIT_SIGPENDING)) {
15827 - q = kmem_cache_alloc(sigqueue_cachep, flags);
15829 + q = get_task_cache(t);
15831 + q = kmem_cache_alloc(sigqueue_cachep, flags);
15833 print_dropped_signal(sig);
15835 @@ -392,6 +413,13 @@
15839 +static struct sigqueue *
15840 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
15841 + int override_rlimit)
15843 + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
15846 static void __sigqueue_free(struct sigqueue *q)
15848 if (q->flags & SIGQUEUE_PREALLOC)
15849 @@ -401,6 +429,21 @@
15850 kmem_cache_free(sigqueue_cachep, q);
15853 +static void sigqueue_free_current(struct sigqueue *q)
15855 + struct user_struct *up;
15857 + if (q->flags & SIGQUEUE_PREALLOC)
15861 + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
15862 + atomic_dec(&up->sigpending);
15865 + __sigqueue_free(q);
15868 void flush_sigqueue(struct sigpending *queue)
15870 struct sigqueue *q;
15871 @@ -414,6 +457,21 @@
15875 + * Called from __exit_signal. Flush tsk->pending and
15876 + * tsk->sigqueue_cache
15878 +void flush_task_sigqueue(struct task_struct *tsk)
15880 + struct sigqueue *q;
15882 + flush_sigqueue(&tsk->pending);
15884 + q = get_task_cache(tsk);
15886 + kmem_cache_free(sigqueue_cachep, q);
15890 * Flush all pending signals for this kthread.
15892 void flush_signals(struct task_struct *t)
15893 @@ -525,7 +583,7 @@
15895 list_del_init(&first->list);
15896 copy_siginfo(info, &first->info);
15897 - __sigqueue_free(first);
15898 + sigqueue_free_current(first);
15901 * Ok, it wasn't in the queue. This must be
15902 @@ -560,6 +618,8 @@
15906 + WARN_ON_ONCE(tsk != current);
15908 /* We only dequeue private signals from ourselves, we don't let
15909 * signalfd steal them
15911 @@ -1156,8 +1216,8 @@
15912 * We don't want to have recursive SIGSEGV's etc, for example,
15913 * that is why we also clear SIGNAL_UNKILLABLE.
15916 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15918 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15920 unsigned long int flags;
15921 int ret, blocked, ignored;
15922 @@ -1182,6 +1242,39 @@
15926 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
15929 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
15930 + * since it cannot enable preemption, and the signal code's spin_locks
15931 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
15932 + * send the signal on exit of the trap.
15934 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
15935 + if (in_atomic()) {
15936 + if (WARN_ON_ONCE(t != current))
15938 + if (WARN_ON_ONCE(t->forced_info.si_signo))
15941 + if (is_si_special(info)) {
15942 + WARN_ON_ONCE(info != SEND_SIG_PRIV);
15943 + t->forced_info.si_signo = sig;
15944 + t->forced_info.si_errno = 0;
15945 + t->forced_info.si_code = SI_KERNEL;
15946 + t->forced_info.si_pid = 0;
15947 + t->forced_info.si_uid = 0;
15949 + t->forced_info = *info;
15952 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
15956 + return do_force_sig_info(sig, info, t);
15960 * Nuke all other threads in the group.
15962 @@ -1216,12 +1309,12 @@
15963 * Disable interrupts early to avoid deadlocks.
15964 * See rcu_read_unlock() comment header for details.
15966 - local_irq_save(*flags);
15967 + local_irq_save_nort(*flags);
15969 sighand = rcu_dereference(tsk->sighand);
15970 if (unlikely(sighand == NULL)) {
15972 - local_irq_restore(*flags);
15973 + local_irq_restore_nort(*flags);
15977 @@ -1242,7 +1335,7 @@
15979 spin_unlock(&sighand->siglock);
15981 - local_irq_restore(*flags);
15982 + local_irq_restore_nort(*flags);
15986 @@ -1485,7 +1578,8 @@
15988 struct sigqueue *sigqueue_alloc(void)
15990 - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
15991 + /* Preallocated sigqueue objects always come from the slab cache! */
15992 + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
15995 q->flags |= SIGQUEUE_PREALLOC;
15996 @@ -1846,15 +1940,7 @@
15997 if (gstop_done && ptrace_reparented(current))
15998 do_notify_parent_cldstop(current, false, why);
16001 - * Don't want to allow preemption here, because
16002 - * sys_ptrace() needs this task to be inactive.
16004 - * XXX: implement read_unlock_no_resched().
16006 - preempt_disable();
16007 read_unlock(&tasklist_lock);
16008 - preempt_enable_no_resched();
16009 freezable_schedule();
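The sigqueue changes above hinge on a one-slot, lock-free per-task cache: get_task_cache() claims the slot with cmpxchg(ptr, q, NULL), put_task_cache() refills it with cmpxchg(ptr, NULL, q), so RT-priority tasks recycle one sigqueue without touching the slab allocator. A standalone userspace model of the same pattern (purely illustrative; C11 atomics stand in for the kernel's cmpxchg):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic(void *) slot;    /* one-entry cache */

    static void *cache_get(void)
    {
        void *q = atomic_load(&slot);

        /* Claim the cached object only if nobody raced us. */
        if (q && atomic_compare_exchange_strong(&slot, &q, NULL))
            return q;
        return NULL;
    }

    static int cache_put(void *q)
    {
        void *expected = NULL;

        /* Succeeds only while the slot is empty. */
        return atomic_compare_exchange_strong(&slot, &expected, q) ? 0 : 1;
    }

    int main(void)
    {
        int obj;

        printf("first put:  %d\n", cache_put(&obj));      /* 0: cached */
        printf("second put: %d\n", cache_put(&obj));      /* 1: slot busy */
        printf("get back:   %d\n", cache_get() == &obj);  /* 1 */
        return 0;
    }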
16012 diff -Nur linux-4.9.6.orig/kernel/softirq.c linux-4.9.6/kernel/softirq.c
16013 --- linux-4.9.6.orig/kernel/softirq.c 2017-01-26 08:25:24.000000000 +0100
16014 +++ linux-4.9.6/kernel/softirq.c 2017-01-28 13:59:10.083662028 +0100
16015 @@ -21,10 +21,12 @@
16016 #include <linux/freezer.h>
16017 #include <linux/kthread.h>
16018 #include <linux/rcupdate.h>
16019 +#include <linux/delay.h>
16020 #include <linux/ftrace.h>
16021 #include <linux/smp.h>
16022 #include <linux/smpboot.h>
16023 #include <linux/tick.h>
16024 +#include <linux/locallock.h>
16025 #include <linux/irq.h>
16027 #define CREATE_TRACE_POINTS
16028 @@ -56,12 +58,108 @@
16029 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
16031 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
16032 +#ifdef CONFIG_PREEMPT_RT_FULL
16033 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
16034 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
16037 const char * const softirq_to_name[NR_SOFTIRQS] = {
16038 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
16039 "TASKLET", "SCHED", "HRTIMER", "RCU"
16042 +#ifdef CONFIG_NO_HZ_COMMON
16043 +# ifdef CONFIG_PREEMPT_RT_FULL
16045 +struct softirq_runner {
16046 + struct task_struct *runner[NR_SOFTIRQS];
16049 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
16051 +static inline void softirq_set_runner(unsigned int sirq)
16053 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16055 + sr->runner[sirq] = current;
16058 +static inline void softirq_clr_runner(unsigned int sirq)
16060 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16062 + sr->runner[sirq] = NULL;
16066 + * On preempt-rt a softirq running context might be blocked on a
16067 + * lock. There might be no other runnable task on this CPU because the
16068 + * lock owner runs on some other CPU. So we have to go into idle with
16069 + * the pending bit set. Therefore we need to check this, otherwise we
16070 + * warn about false positives, which confuses users and defeats the
16071 + * whole purpose of this test.
16073 + * This code is called with interrupts disabled.
16075 +void softirq_check_pending_idle(void)
16077 + static int rate_limit;
16078 + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16082 + if (rate_limit >= 10)
16085 + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
16086 + for (i = 0; i < NR_SOFTIRQS; i++) {
16087 + struct task_struct *tsk = sr->runner[i];
16090 + * The wakeup code in rtmutex.c wakes up the task
16091 + * _before_ it sets pi_blocked_on to NULL under
16092 + * tsk->pi_lock. So we need to check for both: state
16093 + * and pi_blocked_on.
16096 + raw_spin_lock(&tsk->pi_lock);
16097 + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
16098 + /* Clear all bits pending in that task */
16099 + warnpending &= ~(tsk->softirqs_raised);
16100 + warnpending &= ~(1 << i);
16102 + raw_spin_unlock(&tsk->pi_lock);
16106 + if (warnpending) {
16107 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16114 + * On !PREEMPT_RT we just do a rate-limited printk:
16116 +void softirq_check_pending_idle(void)
16118 + static int rate_limit;
16120 + if (rate_limit < 10 &&
16121 + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
16122 + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16123 + local_softirq_pending());
16129 +#else /* !CONFIG_NO_HZ_COMMON */
16130 +static inline void softirq_set_runner(unsigned int sirq) { }
16131 +static inline void softirq_clr_runner(unsigned int sirq) { }
16135 * we cannot loop indefinitely here to avoid userspace starvation,
16136 * but we also don't want to introduce a worst case 1/HZ latency
16137 @@ -77,6 +175,37 @@
16138 wake_up_process(tsk);
16141 +#ifdef CONFIG_PREEMPT_RT_FULL
16142 +static void wakeup_timer_softirqd(void)
16144 + /* Interrupts are disabled: no need to stop preemption */
16145 + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
16147 + if (tsk && tsk->state != TASK_RUNNING)
16148 + wake_up_process(tsk);
16152 +static void handle_softirq(unsigned int vec_nr)
16154 + struct softirq_action *h = softirq_vec + vec_nr;
16157 + prev_count = preempt_count();
16159 + kstat_incr_softirqs_this_cpu(vec_nr);
16161 + trace_softirq_entry(vec_nr);
16163 + trace_softirq_exit(vec_nr);
16164 + if (unlikely(prev_count != preempt_count())) {
16165 + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16166 + vec_nr, softirq_to_name[vec_nr], h->action,
16167 + prev_count, preempt_count());
16168 + preempt_count_set(prev_count);
16173 * If ksoftirqd is scheduled, we do not want to process pending softirqs
16174 * right now. Let ksoftirqd handle this at its own rate, to get fairness.
16175 @@ -88,6 +217,48 @@
16176 return tsk && (tsk->state == TASK_RUNNING);
16179 +#ifndef CONFIG_PREEMPT_RT_FULL
16180 +static inline int ksoftirqd_softirq_pending(void)
16182 + return local_softirq_pending();
16185 +static void handle_pending_softirqs(u32 pending)
16187 + struct softirq_action *h = softirq_vec;
16190 + local_irq_enable();
16194 + while ((softirq_bit = ffs(pending))) {
16195 + unsigned int vec_nr;
16197 + h += softirq_bit - 1;
16198 + vec_nr = h - softirq_vec;
16199 + handle_softirq(vec_nr);
16202 + pending >>= softirq_bit;
16206 + local_irq_disable();
16209 +static void run_ksoftirqd(unsigned int cpu)
16211 + local_irq_disable();
16212 + if (ksoftirqd_softirq_pending()) {
16214 + local_irq_enable();
16215 + cond_resched_rcu_qs();
16218 + local_irq_enable();
16222 * preempt_count and SOFTIRQ_OFFSET usage:
16223 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
16224 @@ -243,10 +414,8 @@
16225 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
16226 unsigned long old_flags = current->flags;
16227 int max_restart = MAX_SOFTIRQ_RESTART;
16228 - struct softirq_action *h;
16234 * Mask out PF_MEMALLOC as current task context is borrowed for the
16235 @@ -265,36 +434,7 @@
16236 /* Reset the pending bitmask before enabling irqs */
16237 set_softirq_pending(0);
16239 - local_irq_enable();
16243 - while ((softirq_bit = ffs(pending))) {
16244 - unsigned int vec_nr;
16247 - h += softirq_bit - 1;
16249 - vec_nr = h - softirq_vec;
16250 - prev_count = preempt_count();
16252 - kstat_incr_softirqs_this_cpu(vec_nr);
16254 - trace_softirq_entry(vec_nr);
16256 - trace_softirq_exit(vec_nr);
16257 - if (unlikely(prev_count != preempt_count())) {
16258 - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16259 - vec_nr, softirq_to_name[vec_nr], h->action,
16260 - prev_count, preempt_count());
16261 - preempt_count_set(prev_count);
16264 - pending >>= softirq_bit;
16268 - local_irq_disable();
16269 + handle_pending_softirqs(pending);
16271 pending = local_softirq_pending();
16273 @@ -331,6 +471,309 @@
16277 + * This function must run with irqs disabled!
16279 +void raise_softirq_irqoff(unsigned int nr)
16281 + __raise_softirq_irqoff(nr);
16284 + * If we're in an interrupt or softirq, we're done
16285 + * (this also catches softirq-disabled code). We will
16286 + * actually run the softirq once we return from
16287 + * the irq or softirq.
16289 + * Otherwise we wake up ksoftirqd to make sure we
16290 + * schedule the softirq soon.
16292 + if (!in_interrupt())
16293 + wakeup_softirqd();
16296 +void __raise_softirq_irqoff(unsigned int nr)
16298 + trace_softirq_raise(nr);
16299 + or_softirq_pending(1UL << nr);
16302 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
16303 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
16304 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
16306 +#else /* !PREEMPT_RT_FULL */
16309 + * On RT we serialize softirq execution with a cpu local lock per softirq
16311 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
16313 +void __init softirq_early_init(void)
16317 + for (i = 0; i < NR_SOFTIRQS; i++)
16318 + local_irq_lock_init(local_softirq_locks[i]);
16321 +static void lock_softirq(int which)
16323 + local_lock(local_softirq_locks[which]);
16326 +static void unlock_softirq(int which)
16328 + local_unlock(local_softirq_locks[which]);
16331 +static void do_single_softirq(int which)
16333 + unsigned long old_flags = current->flags;
16335 + current->flags &= ~PF_MEMALLOC;
16336 + vtime_account_irq_enter(current);
16337 + current->flags |= PF_IN_SOFTIRQ;
16338 + lockdep_softirq_enter();
16339 + local_irq_enable();
16340 + handle_softirq(which);
16341 + local_irq_disable();
16342 + lockdep_softirq_exit();
16343 + current->flags &= ~PF_IN_SOFTIRQ;
16344 + vtime_account_irq_enter(current);
16345 + tsk_restore_flags(current, old_flags, PF_MEMALLOC);
16349 + * Called with interrupts disabled. Process softirqs which were raised
16350 + * in current context (or on behalf of ksoftirqd).
16352 +static void do_current_softirqs(void)
16354 + while (current->softirqs_raised) {
16355 + int i = __ffs(current->softirqs_raised);
16356 + unsigned int pending, mask = (1U << i);
16358 + current->softirqs_raised &= ~mask;
16359 + local_irq_enable();
16362 + * If the lock is contended, we boost the owner to
16363 + * process the softirq or leave the critical section
16367 + local_irq_disable();
16368 + softirq_set_runner(i);
16370 + * Check against the local_softirq_pending() bits
16371 + * whether we still need to process this, or if someone
16372 + * else already took care of it.
16374 + pending = local_softirq_pending();
16375 + if (pending & mask) {
16376 + set_softirq_pending(pending & ~mask);
16377 + do_single_softirq(i);
16379 + softirq_clr_runner(i);
16380 + WARN_ON(current->softirq_nestcnt != 1);
16381 + local_irq_enable();
16382 + unlock_softirq(i);
16383 + local_irq_disable();
16387 +void __local_bh_disable(void)
16389 + if (++current->softirq_nestcnt == 1)
16390 + migrate_disable();
16392 +EXPORT_SYMBOL(__local_bh_disable);
16394 +void __local_bh_enable(void)
16396 + if (WARN_ON(current->softirq_nestcnt == 0))
16399 + local_irq_disable();
16400 + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
16401 + do_current_softirqs();
16402 + local_irq_enable();
16404 + if (--current->softirq_nestcnt == 0)
16405 + migrate_enable();
16407 +EXPORT_SYMBOL(__local_bh_enable);
16409 +void _local_bh_enable(void)
16411 + if (WARN_ON(current->softirq_nestcnt == 0))
16413 + if (--current->softirq_nestcnt == 0)
16414 + migrate_enable();
16416 +EXPORT_SYMBOL(_local_bh_enable);
16418 +int in_serving_softirq(void)
16420 + return current->flags & PF_IN_SOFTIRQ;
16422 +EXPORT_SYMBOL(in_serving_softirq);
16424 +/* Called with preemption disabled */
16425 +static void run_ksoftirqd(unsigned int cpu)
16427 + local_irq_disable();
16428 + current->softirq_nestcnt++;
16430 + do_current_softirqs();
16431 + current->softirq_nestcnt--;
16432 + local_irq_enable();
16433 + cond_resched_rcu_qs();
16437 + * Called from netif_rx_ni(). Preemption enabled, but migration
16438 + * disabled. So the cpu can't go away under us.
16440 +void thread_do_softirq(void)
16442 + if (!in_serving_softirq() && current->softirqs_raised) {
16443 + current->softirq_nestcnt++;
16444 + do_current_softirqs();
16445 + current->softirq_nestcnt--;
16449 +static void do_raise_softirq_irqoff(unsigned int nr)
16451 + unsigned int mask;
16453 + mask = 1UL << nr;
16455 + trace_softirq_raise(nr);
16456 + or_softirq_pending(mask);
16459 + * If we are not in a hard interrupt and inside a bh disabled
16460 + * region, we simply raise the flag on current. local_bh_enable()
16461 + * will make sure that the softirq is executed. Otherwise we
16462 + * delegate it to ksoftirqd.
16464 + if (!in_irq() && current->softirq_nestcnt)
16465 + current->softirqs_raised |= mask;
16466 + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
16469 + if (mask & TIMER_SOFTIRQS)
16470 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16472 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16475 +static void wakeup_proper_softirq(unsigned int nr)
16477 + if ((1UL << nr) & TIMER_SOFTIRQS)
16478 + wakeup_timer_softirqd();
16480 + wakeup_softirqd();
16483 +void __raise_softirq_irqoff(unsigned int nr)
16485 + do_raise_softirq_irqoff(nr);
16486 + if (!in_irq() && !current->softirq_nestcnt)
16487 + wakeup_proper_softirq(nr);
16491 + * Same as __raise_softirq_irqoff(), but processes the softirq in ksoftirqd
16493 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
16495 + unsigned int mask;
16497 + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
16498 + !__this_cpu_read(ktimer_softirqd)))
16500 + mask = 1UL << nr;
16502 + trace_softirq_raise(nr);
16503 + or_softirq_pending(mask);
16504 + if (mask & TIMER_SOFTIRQS)
16505 + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
16507 + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
16508 + wakeup_proper_softirq(nr);
16512 + * This function must run with irqs disabled!
16514 +void raise_softirq_irqoff(unsigned int nr)
16516 + do_raise_softirq_irqoff(nr);
16519 + * If we're in a hard interrupt we let the irq return code deal
16520 + * with the wakeup of ksoftirqd.
16525 + * If we are in thread context but outside of a bh disabled
16526 + * region, we need to wake ksoftirqd as well.
16528 + * CHECKME: Some of the places which do that could be wrapped
16529 + * into local_bh_disable/enable pairs. Though it's unclear
16530 + * whether this is worth the effort. To find those places just
16531 + * raise a WARN() if the condition is met.
16533 + if (!current->softirq_nestcnt)
16534 + wakeup_proper_softirq(nr);
16537 +static inline int ksoftirqd_softirq_pending(void)
16539 + return current->softirqs_raised;
16542 +static inline void local_bh_disable_nort(void) { }
16543 +static inline void _local_bh_enable_nort(void) { }
16545 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
16547 + /* Take over all pending softirqs except timer ones when starting */
16548 + local_irq_disable();
16549 + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
16550 + local_irq_enable();
16553 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
16555 + struct sched_param param = { .sched_priority = 1 };
16557 + sched_setscheduler(current, SCHED_FIFO, &param);
16559 + /* Take over the pending timer softirqs when starting */
16560 + local_irq_disable();
16561 + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
16562 + local_irq_enable();
16565 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
16568 + struct sched_param param = { .sched_priority = 0 };
16570 + sched_setscheduler(current, SCHED_NORMAL, &param);
16573 +static int ktimer_softirqd_should_run(unsigned int cpu)
16575 + return current->softirqs_raised;
16578 +#endif /* PREEMPT_RT_FULL */
16580 * Enter an interrupt context.
16582 void irq_enter(void)
16583 @@ -341,9 +784,9 @@
16584 * Prevent raise_softirq from needlessly waking up ksoftirqd
16585 * here, as softirq will be serviced on return from interrupt.
16587 - local_bh_disable();
16588 + local_bh_disable_nort();
16590 - _local_bh_enable();
16591 + _local_bh_enable_nort();
16595 @@ -351,9 +794,13 @@
16597 static inline void invoke_softirq(void)
16599 +#ifdef CONFIG_PREEMPT_RT_FULL
16600 + unsigned long flags;
16603 if (ksoftirqd_running())
16606 +#ifndef CONFIG_PREEMPT_RT_FULL
16607 if (!force_irqthreads) {
16608 #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
16610 @@ -373,6 +820,17 @@
16614 +#else /* PREEMPT_RT_FULL */
16616 + local_irq_save(flags);
16617 + if (__this_cpu_read(ksoftirqd) &&
16618 + __this_cpu_read(ksoftirqd)->softirqs_raised)
16619 + wakeup_softirqd();
16620 + if (__this_cpu_read(ktimer_softirqd) &&
16621 + __this_cpu_read(ktimer_softirqd)->softirqs_raised)
16622 + wakeup_timer_softirqd();
16623 + local_irq_restore(flags);
16627 static inline void tick_irq_exit(void)
16628 @@ -409,26 +867,6 @@
16629 trace_hardirq_exit(); /* must be last! */
16633 - * This function must run with irqs disabled!
16635 -inline void raise_softirq_irqoff(unsigned int nr)
16637 - __raise_softirq_irqoff(nr);
16640 - * If we're in an interrupt or softirq, we're done
16641 - * (this also catches softirq-disabled code). We will
16642 - * actually run the softirq once we return from
16643 - * the irq or softirq.
16645 - * Otherwise we wake up ksoftirqd to make sure we
16646 - * schedule the softirq soon.
16648 - if (!in_interrupt())
16649 - wakeup_softirqd();
16652 void raise_softirq(unsigned int nr)
16654 unsigned long flags;
16655 @@ -438,12 +876,6 @@
16656 local_irq_restore(flags);
16659 -void __raise_softirq_irqoff(unsigned int nr)
16661 - trace_softirq_raise(nr);
16662 - or_softirq_pending(1UL << nr);
16665 void open_softirq(int nr, void (*action)(struct softirq_action *))
16667 softirq_vec[nr].action = action;
16668 @@ -460,15 +892,45 @@
16669 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
16670 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
16672 +static inline void
16673 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
16675 + if (tasklet_trylock(t)) {
16677 + /* We may have been preempted before tasklet_trylock
16678 + * and __tasklet_action may have already run.
16679 + * So double-check the sched bit while the tasklet
16680 + * is locked before adding it to the list.
16682 + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
16685 + head->tail = &(t->next);
16686 + raise_softirq_irqoff(nr);
16687 + tasklet_unlock(t);
16689 + /* This is subtle. If we hit the corner case above,
16690 + * it is possible that we get preempted right here,
16691 + * and another task has successfully called
16692 + * tasklet_schedule(), then entered this function, and
16693 + * failed on the trylock. Thus we must be sure
16694 + * before releasing the tasklet lock, that the
16695 + * SCHED_BIT is clear. Otherwise the tasklet
16696 + * may get its SCHED_BIT set, but not added to the
16699 + if (!tasklet_tryunlock(t))
16705 void __tasklet_schedule(struct tasklet_struct *t)
16707 unsigned long flags;
16709 local_irq_save(flags);
16711 - *__this_cpu_read(tasklet_vec.tail) = t;
16712 - __this_cpu_write(tasklet_vec.tail, &(t->next));
16713 - raise_softirq_irqoff(TASKLET_SOFTIRQ);
16714 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
16715 local_irq_restore(flags);
16717 EXPORT_SYMBOL(__tasklet_schedule);
16718 @@ -478,10 +940,7 @@
16719 unsigned long flags;
16721 local_irq_save(flags);
16723 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
16724 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
16725 - raise_softirq_irqoff(HI_SOFTIRQ);
16726 + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
16727 local_irq_restore(flags);
16729 EXPORT_SYMBOL(__tasklet_hi_schedule);
16730 @@ -490,82 +949,122 @@
16732 BUG_ON(!irqs_disabled());
16734 - t->next = __this_cpu_read(tasklet_hi_vec.head);
16735 - __this_cpu_write(tasklet_hi_vec.head, t);
16736 - __raise_softirq_irqoff(HI_SOFTIRQ);
16737 + __tasklet_hi_schedule(t);
16739 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
16741 -static __latent_entropy void tasklet_action(struct softirq_action *a)
16742 +void tasklet_enable(struct tasklet_struct *t)
16744 - struct tasklet_struct *list;
16745 + if (!atomic_dec_and_test(&t->count))
16747 + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
16748 + tasklet_schedule(t);
16750 +EXPORT_SYMBOL(tasklet_enable);
16752 - local_irq_disable();
16753 - list = __this_cpu_read(tasklet_vec.head);
16754 - __this_cpu_write(tasklet_vec.head, NULL);
16755 - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
16756 - local_irq_enable();
16757 +static void __tasklet_action(struct softirq_action *a,
16758 + struct tasklet_struct *list)
16760 + int loops = 1000000;
16763 struct tasklet_struct *t = list;
16767 - if (tasklet_trylock(t)) {
16768 - if (!atomic_read(&t->count)) {
16769 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
16772 - t->func(t->data);
16773 - tasklet_unlock(t);
16776 - tasklet_unlock(t);
16778 + * Should always succeed - after a tasklet got on the
16779 + * list (after getting the SCHED bit set from 0 to 1),
16780 + * nothing but the tasklet softirq it got queued to can
16783 + if (!tasklet_trylock(t)) {
16788 - local_irq_disable();
16790 - *__this_cpu_read(tasklet_vec.tail) = t;
16791 - __this_cpu_write(tasklet_vec.tail, &(t->next));
16792 - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
16793 - local_irq_enable();
16796 + * If we cannot handle the tasklet because it's disabled,
16797 + * mark it as pending. tasklet_enable() will later
16798 + * re-schedule the tasklet.
16800 + if (unlikely(atomic_read(&t->count))) {
16802 + /* implicit unlock: */
16804 + t->state = TASKLET_STATEF_PENDING;
16809 + * From this point on the tasklet might be rescheduled
16810 + * on another CPU, but it can only be added to another
16811 + * CPU's tasklet list if we unlock the tasklet (which we
16814 + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
16818 + t->func(t->data);
16821 + * Try to unlock the tasklet. We must use cmpxchg, because
16822 + * another CPU might have scheduled or disabled the tasklet.
16823 + * We only allow the STATE_RUN -> 0 transition here.
16825 + while (!tasklet_tryunlock(t)) {
16827 + * If it got disabled meanwhile, bail out:
16829 + if (atomic_read(&t->count))
16830 + goto out_disabled;
16832 + * If it got scheduled meanwhile, re-execute
16833 + * the tasklet function:
16835 + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
16838 + printk("hm, tasklet state: %08lx\n", t->state);
16840 + tasklet_unlock(t);
16847 +static void tasklet_action(struct softirq_action *a)
16849 + struct tasklet_struct *list;
16851 + local_irq_disable();
16853 + list = __this_cpu_read(tasklet_vec.head);
16854 + __this_cpu_write(tasklet_vec.head, NULL);
16855 + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
16857 + local_irq_enable();
16859 + __tasklet_action(a, list);
16862 static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
16864 struct tasklet_struct *list;
16866 local_irq_disable();
16868 list = __this_cpu_read(tasklet_hi_vec.head);
16869 __this_cpu_write(tasklet_hi_vec.head, NULL);
16870 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
16871 - local_irq_enable();
16874 - struct tasklet_struct *t = list;
16876 - list = list->next;
16878 - if (tasklet_trylock(t)) {
16879 - if (!atomic_read(&t->count)) {
16880 - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
16883 - t->func(t->data);
16884 - tasklet_unlock(t);
16887 - tasklet_unlock(t);
16889 + local_irq_enable();
16891 - local_irq_disable();
16893 - *__this_cpu_read(tasklet_hi_vec.tail) = t;
16894 - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
16895 - __raise_softirq_irqoff(HI_SOFTIRQ);
16896 - local_irq_enable();
16898 + __tasklet_action(a, list);
16901 void tasklet_init(struct tasklet_struct *t,
16902 @@ -586,7 +1085,7 @@
16904 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
16908 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
16910 tasklet_unlock_wait(t);
16911 @@ -660,25 +1159,26 @@
16912 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
16915 -static int ksoftirqd_should_run(unsigned int cpu)
16916 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
16917 +void tasklet_unlock_wait(struct tasklet_struct *t)
16919 - return local_softirq_pending();
16922 -static void run_ksoftirqd(unsigned int cpu)
16924 - local_irq_disable();
16925 - if (local_softirq_pending()) {
16926 + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
16928 - * We can safely run softirq on inline stack, as we are not deep
16929 - * in the task stack here.
16930 + * Hack for now to avoid this busy-loop:
16933 - local_irq_enable();
16934 - cond_resched_rcu_qs();
16936 +#ifdef CONFIG_PREEMPT_RT_FULL
16942 - local_irq_enable();
16944 +EXPORT_SYMBOL(tasklet_unlock_wait);
16947 +static int ksoftirqd_should_run(unsigned int cpu)
16949 + return ksoftirqd_softirq_pending();
16952 #ifdef CONFIG_HOTPLUG_CPU
16953 @@ -745,17 +1245,31 @@
16955 static struct smp_hotplug_thread softirq_threads = {
16956 .store = &ksoftirqd,
16957 + .setup = ksoftirqd_set_sched_params,
16958 .thread_should_run = ksoftirqd_should_run,
16959 .thread_fn = run_ksoftirqd,
16960 .thread_comm = "ksoftirqd/%u",
16963 +#ifdef CONFIG_PREEMPT_RT_FULL
16964 +static struct smp_hotplug_thread softirq_timer_threads = {
16965 + .store = &ktimer_softirqd,
16966 + .setup = ktimer_softirqd_set_sched_params,
16967 + .cleanup = ktimer_softirqd_clr_sched_params,
16968 + .thread_should_run = ktimer_softirqd_should_run,
16969 + .thread_fn = run_ksoftirqd,
16970 + .thread_comm = "ktimersoftd/%u",
16974 static __init int spawn_ksoftirqd(void)
16976 cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
16977 takeover_tasklets);
16978 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
16980 +#ifdef CONFIG_PREEMPT_RT_FULL
16981 + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
16985 early_initcall(spawn_ksoftirqd);
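The bit-walk that handle_pending_softirqs() factors out above processes the lowest pending softirq first and shifts the mask as it goes. A tiny userspace model of that ffs() loop, with names[] mirroring softirq_to_name[]:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    static const char * const names[] = {
        "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
        "TASKLET", "SCHED", "HRTIMER", "RCU"
    };

    int main(void)
    {
        unsigned int pending = (1u << 1) | (1u << 3) | (1u << 9);
        const char * const *h = names;
        int softirq_bit;

        /* Lowest pending bit first, exactly like the kernel loop. */
        while ((softirq_bit = ffs(pending))) {
            h += softirq_bit - 1;
            printf("softirq %ld: %s\n", (long)(h - names), *h);
            h++;
            pending >>= softirq_bit;
        }
        return 0;    /* prints TIMER, NET_RX, RCU */
    }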
16986 diff -Nur linux-4.9.6.orig/kernel/stop_machine.c linux-4.9.6/kernel/stop_machine.c
16987 --- linux-4.9.6.orig/kernel/stop_machine.c 2017-01-26 08:25:24.000000000 +0100
16988 +++ linux-4.9.6/kernel/stop_machine.c 2017-01-28 13:59:10.087662182 +0100
16990 struct cpu_stopper {
16991 struct task_struct *thread;
16994 + raw_spinlock_t lock;
16995 bool enabled; /* is this stopper enabled? */
16996 struct list_head works; /* list of pending works */
16998 @@ -78,14 +78,14 @@
16999 unsigned long flags;
17002 - spin_lock_irqsave(&stopper->lock, flags);
17003 + raw_spin_lock_irqsave(&stopper->lock, flags);
17004 enabled = stopper->enabled;
17006 __cpu_stop_queue_work(stopper, work);
17007 else if (work->done)
17008 cpu_stop_signal_done(work->done);
17009 - spin_unlock_irqrestore(&stopper->lock, flags);
17011 + raw_spin_unlock_irqrestore(&stopper->lock, flags);
17015 @@ -231,8 +231,8 @@
17016 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
17019 - spin_lock_irq(&stopper1->lock);
17020 - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17021 + raw_spin_lock_irq(&stopper1->lock);
17022 + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17025 if (!stopper1->enabled || !stopper2->enabled)
17026 @@ -255,8 +255,8 @@
17027 __cpu_stop_queue_work(stopper1, work1);
17028 __cpu_stop_queue_work(stopper2, work2);
17030 - spin_unlock(&stopper2->lock);
17031 - spin_unlock_irq(&stopper1->lock);
17032 + raw_spin_unlock(&stopper2->lock);
17033 + raw_spin_unlock_irq(&stopper1->lock);
17035 if (unlikely(err == -EDEADLK)) {
17036 while (stop_cpus_in_progress)
17037 @@ -448,9 +448,9 @@
17038 unsigned long flags;
17041 - spin_lock_irqsave(&stopper->lock, flags);
17042 + raw_spin_lock_irqsave(&stopper->lock, flags);
17043 run = !list_empty(&stopper->works);
17044 - spin_unlock_irqrestore(&stopper->lock, flags);
17045 + raw_spin_unlock_irqrestore(&stopper->lock, flags);
17049 @@ -461,13 +461,13 @@
17053 - spin_lock_irq(&stopper->lock);
17054 + raw_spin_lock_irq(&stopper->lock);
17055 if (!list_empty(&stopper->works)) {
17056 work = list_first_entry(&stopper->works,
17057 struct cpu_stop_work, list);
17058 list_del_init(&work->list);
17060 - spin_unlock_irq(&stopper->lock);
17061 + raw_spin_unlock_irq(&stopper->lock);
17064 cpu_stop_fn_t fn = work->fn;
17065 @@ -475,6 +475,8 @@
17066 struct cpu_stop_done *done = work->done;
17071 /* cpu stop callbacks must not sleep, make in_atomic() == T */
17072 preempt_count_inc();
17074 @@ -541,7 +543,7 @@
17075 for_each_possible_cpu(cpu) {
17076 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
17078 - spin_lock_init(&stopper->lock);
17079 + raw_spin_lock_init(&stopper->lock);
17080 INIT_LIST_HEAD(&stopper->works);
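The stop_machine.c hunks above are a pure lock-type conversion: on PREEMPT_RT a spinlock_t becomes a sleeping rtmutex-based lock, while the stopper work is queued and consumed in contexts that must never sleep, hence raw_spinlock_t. The resulting pattern, with a hypothetical lock name:

    static DEFINE_RAW_SPINLOCK(my_lock);

    static void my_atomic_path(void)
    {
        unsigned long flags;

        /* Strictly non-sleeping critical section; valid on RT. */
        raw_spin_lock_irqsave(&my_lock, flags);
        /* ... */
        raw_spin_unlock_irqrestore(&my_lock, flags);
    }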
17083 diff -Nur linux-4.9.6.orig/kernel/time/hrtimer.c linux-4.9.6/kernel/time/hrtimer.c
17084 --- linux-4.9.6.orig/kernel/time/hrtimer.c 2017-01-26 08:25:24.000000000 +0100
17085 +++ linux-4.9.6/kernel/time/hrtimer.c 2017-01-28 13:59:10.087662182 +0100
17087 #include <asm/uaccess.h>
17089 #include <trace/events/timer.h>
17090 +#include <trace/events/hist.h>
17092 #include "tick-internal.h"
17094 @@ -695,6 +696,29 @@
17095 retrigger_next_event(NULL);
17098 +#ifdef CONFIG_PREEMPT_RT_FULL
17100 +static struct swork_event clock_set_delay_work;
17102 +static void run_clock_set_delay(struct swork_event *event)
17107 +void clock_was_set_delayed(void)
17109 + swork_queue(&clock_set_delay_work);
17112 +static __init int create_clock_set_delay_thread(void)
17114 + WARN_ON(swork_get());
17115 + INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
17118 +early_initcall(create_clock_set_delay_thread);
17119 +#else /* PREEMPT_RT_FULL */
17121 static void clock_was_set_work(struct work_struct *work)
17124 @@ -710,6 +734,7 @@
17126 schedule_work(&hrtimer_work);
17132 @@ -719,11 +744,8 @@
17133 static inline void hrtimer_switch_to_hres(void) { }
17135 hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
17136 -static inline int hrtimer_reprogram(struct hrtimer *timer,
17137 - struct hrtimer_clock_base *base)
17141 +static inline void hrtimer_reprogram(struct hrtimer *timer,
17142 + struct hrtimer_clock_base *base) { }
17143 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
17144 static inline void retrigger_next_event(void *arg) { }
17146 @@ -855,6 +877,32 @@
17148 EXPORT_SYMBOL_GPL(hrtimer_forward);
17150 +#ifdef CONFIG_PREEMPT_RT_BASE
17151 +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
17154 + * hrtimer_wait_for_timer - Wait for a running timer
17156 + * @timer: timer to wait for
17158 + * The function waits on the waitqueue of the timer base in case
17159 + * the timer's callback function is currently executing. The
17160 + * waitqueue is woken up after the timer callback function has
17161 + * finished execution.
17163 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
17165 + struct hrtimer_clock_base *base = timer->base;
17167 + if (base && base->cpu_base && !timer->irqsafe)
17168 + wait_event(base->cpu_base->wait,
17169 + !(hrtimer_callback_running(timer)));
17173 +# define wake_up_timer_waiters(b) do { } while (0)
17177 * enqueue_hrtimer - internal function to (re)start a timer
17179 @@ -896,6 +944,11 @@
17180 if (!(state & HRTIMER_STATE_ENQUEUED))
17183 + if (unlikely(!list_empty(&timer->cb_entry))) {
17184 + list_del_init(&timer->cb_entry);
17188 if (!timerqueue_del(&base->active, &timer->node))
17189 cpu_base->active_bases &= ~(1 << base->index);
17191 @@ -991,7 +1044,16 @@
17192 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
17194 timer_stats_hrtimer_set_start_info(timer);
17195 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
17197 + ktime_t now = new_base->get_time();
17199 + if (ktime_to_ns(tim) < ktime_to_ns(now))
17200 + timer->praecox = now;
17202 + timer->praecox = ktime_set(0, 0);
17205 leftmost = enqueue_hrtimer(timer, new_base);
17208 @@ -1063,7 +1125,7 @@
17213 + hrtimer_wait_for_timer(timer);
17216 EXPORT_SYMBOL_GPL(hrtimer_cancel);
17217 @@ -1127,6 +1189,7 @@
17219 base = hrtimer_clockid_to_base(clock_id);
17220 timer->base = &cpu_base->clock_base[base];
17221 + INIT_LIST_HEAD(&timer->cb_entry);
17222 timerqueue_init(&timer->node);
17224 #ifdef CONFIG_TIMER_STATS
17225 @@ -1167,6 +1230,7 @@
17226 seq = raw_read_seqcount_begin(&cpu_base->seq);
17228 if (timer->state != HRTIMER_STATE_INACTIVE ||
17229 + cpu_base->running_soft == timer ||
17230 cpu_base->running == timer)
17233 @@ -1265,10 +1329,112 @@
17234 cpu_base->running = NULL;
17237 +#ifdef CONFIG_PREEMPT_RT_BASE
17238 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
17239 + struct hrtimer_clock_base *base)
17243 + if (restart != HRTIMER_NORESTART &&
17244 + !(timer->state & HRTIMER_STATE_ENQUEUED)) {
17246 + leftmost = enqueue_hrtimer(timer, base);
17249 +#ifdef CONFIG_HIGH_RES_TIMERS
17250 + if (!hrtimer_is_hres_active(timer)) {
17252 + * Kick to reschedule the next tick to handle the new timer
17253 + * on dynticks target.
17255 + if (base->cpu_base->nohz_active)
17256 + wake_up_nohz_cpu(base->cpu_base->cpu);
17259 + hrtimer_reprogram(timer, base);
17266 + * The changes in mainline which removed the callback modes from
17267 + * hrtimer are not yet working with -rt. The non-wakeup_process()
17268 + * based callbacks, which involve sleeping locks, need to be treated
17271 +static void hrtimer_rt_run_pending(void)
17273 + enum hrtimer_restart (*fn)(struct hrtimer *);
17274 + struct hrtimer_cpu_base *cpu_base;
17275 + struct hrtimer_clock_base *base;
17276 + struct hrtimer *timer;
17277 + int index, restart;
17279 + local_irq_disable();
17280 + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
17282 + raw_spin_lock(&cpu_base->lock);
17284 + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
17285 + base = &cpu_base->clock_base[index];
17287 + while (!list_empty(&base->expired)) {
17288 + timer = list_first_entry(&base->expired,
17289 + struct hrtimer, cb_entry);
17292 + * Same as the above __run_hrtimer function,
17293 + * except that we run with interrupts enabled.
17295 + debug_deactivate(timer);
17296 + cpu_base->running_soft = timer;
17297 + raw_write_seqcount_barrier(&cpu_base->seq);
17299 + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
17300 + timer_stats_account_hrtimer(timer);
17301 + fn = timer->function;
17303 + raw_spin_unlock_irq(&cpu_base->lock);
17304 + restart = fn(timer);
17305 + raw_spin_lock_irq(&cpu_base->lock);
17307 + hrtimer_rt_reprogram(restart, timer, base);
17308 + raw_write_seqcount_barrier(&cpu_base->seq);
17310 + WARN_ON_ONCE(cpu_base->running_soft != timer);
17311 + cpu_base->running_soft = NULL;
17315 + raw_spin_unlock_irq(&cpu_base->lock);
17317 + wake_up_timer_waiters(cpu_base);
17320 +static int hrtimer_rt_defer(struct hrtimer *timer)
17322 + if (timer->irqsafe)
17325 + __remove_hrtimer(timer, timer->base, timer->state, 0);
17326 + list_add_tail(&timer->cb_entry, &timer->base->expired);
17332 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
17336 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
17338 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
17340 struct hrtimer_clock_base *base = cpu_base->clock_base;
17341 unsigned int active = cpu_base->active_bases;
17344 for (; active; base++, active >>= 1) {
17345 struct timerqueue_node *node;
17346 @@ -1284,6 +1450,15 @@
17348 timer = container_of(node, struct hrtimer, node);
17350 + trace_hrtimer_interrupt(raw_smp_processor_id(),
17351 + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
17352 + timer->praecox : hrtimer_get_expires(timer),
17355 + timer->function == hrtimer_wakeup ?
17356 + container_of(timer, struct hrtimer_sleeper,
17357 + timer)->task : NULL);
17360 * The immediate goal for using the softexpires is
17361 * minimizing wakeups, not running timers at the
17362 @@ -1299,9 +1474,14 @@
17363 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
17366 - __run_hrtimer(cpu_base, base, timer, &basenow);
17367 + if (!hrtimer_rt_defer(timer))
17368 + __run_hrtimer(cpu_base, base, timer, &basenow);
17374 + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
17377 #ifdef CONFIG_HIGH_RES_TIMERS
17378 @@ -1464,16 +1644,18 @@
17379 void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
17381 sl->timer.function = hrtimer_wakeup;
17382 + sl->timer.irqsafe = 1;
17385 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
17387 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
17388 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
17389 + unsigned long state)
17391 hrtimer_init_sleeper(t, current);
17394 - set_current_state(TASK_INTERRUPTIBLE);
17395 + set_current_state(state);
17396 hrtimer_start_expires(&t->timer, mode);
17398 if (likely(t->task))
17399 @@ -1515,7 +1697,8 @@
17401 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
17403 - if (do_nanosleep(&t, HRTIMER_MODE_ABS))
17404 + /* cpu_chill() does not care about restart state. */
17405 + if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
17408 rmtp = restart->nanosleep.rmtp;
17409 @@ -1532,8 +1715,10 @@
17413 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17414 - const enum hrtimer_mode mode, const clockid_t clockid)
17416 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17417 + const enum hrtimer_mode mode, const clockid_t clockid,
17418 + unsigned long state)
17420 struct restart_block *restart;
17421 struct hrtimer_sleeper t;
17422 @@ -1546,7 +1731,7 @@
17424 hrtimer_init_on_stack(&t.timer, clockid, mode);
17425 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
17426 - if (do_nanosleep(&t, mode))
17427 + if (do_nanosleep(&t, mode, state))
17430 /* Absolute timers do not update the rmtp value and restart: */
17431 @@ -1573,6 +1758,12 @@
17435 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
17436 + const enum hrtimer_mode mode, const clockid_t clockid)
17438 + return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
17441 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
17442 struct timespec __user *, rmtp)
17444 @@ -1587,6 +1778,26 @@
17445 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
17448 +#ifdef CONFIG_PREEMPT_RT_FULL
17450 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
17452 +void cpu_chill(void)
17454 + struct timespec tu = {
17455 + .tv_nsec = NSEC_PER_MSEC,
17457 + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
17459 + current->flags |= PF_NOFREEZE;
17460 + __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
17461 + TASK_UNINTERRUPTIBLE);
17462 + if (!freeze_flag)
17463 + current->flags &= ~PF_NOFREEZE;
17465 +EXPORT_SYMBOL(cpu_chill);
17469 * Functions related to boot-time initialization:
17471 @@ -1598,10 +1809,14 @@
17472 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
17473 cpu_base->clock_base[i].cpu_base = cpu_base;
17474 timerqueue_init_head(&cpu_base->clock_base[i].active);
17475 + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
17478 cpu_base->cpu = cpu;
17479 hrtimer_init_hres(cpu_base);
17480 +#ifdef CONFIG_PREEMPT_RT_BASE
17481 + init_waitqueue_head(&cpu_base->wait);
17486 @@ -1671,9 +1886,26 @@
17488 #endif /* CONFIG_HOTPLUG_CPU */
17490 +#ifdef CONFIG_PREEMPT_RT_BASE
17492 +static void run_hrtimer_softirq(struct softirq_action *h)
17494 + hrtimer_rt_run_pending();
17497 +static void hrtimers_open_softirq(void)
17499 + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
17503 +static void hrtimers_open_softirq(void) { }
17506 void __init hrtimers_init(void)
17508 hrtimers_prepare_cpu(smp_processor_id());
17509 + hrtimers_open_softirq();
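cpu_chill() above is RT's replacement for busy-waiting on state owned by a task that may itself be preempted: instead of spinning, the waiter sleeps for one millisecond, uninterruptible and non-freezable. A hedged usage sketch (wait_for_flag is hypothetical; the declaration is assumed to come via linux/delay.h, which the softirq.c hunk above starts including):

    #include <linux/bitops.h>
    #include <linux/delay.h>

    static void wait_for_flag(unsigned long *state, int bit)
    {
        while (test_bit(bit, state)) {
    #ifdef CONFIG_PREEMPT_RT_FULL
            cpu_chill();    /* sleep 1 ms, let the holder run */
    #else
            cpu_relax();
    #endif
        }
    }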
17513 diff -Nur linux-4.9.6.orig/kernel/time/itimer.c linux-4.9.6/kernel/time/itimer.c
17514 --- linux-4.9.6.orig/kernel/time/itimer.c 2017-01-26 08:25:24.000000000 +0100
17515 +++ linux-4.9.6/kernel/time/itimer.c 2017-01-28 13:59:10.087662182 +0100
17516 @@ -213,6 +213,7 @@
17517 /* We are sharing ->siglock with it_real_fn() */
17518 if (hrtimer_try_to_cancel(timer) < 0) {
17519 spin_unlock_irq(&tsk->sighand->siglock);
17520 + hrtimer_wait_for_timer(&tsk->signal->real_timer);
17523 expires = timeval_to_ktime(value->it_value);
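This one-line itimer.c change plugs into the existing cancel-retry loop of do_setitimer(); schematically, the surrounding upstream code looks like this (a sketch, not part of the hunk):

    again:
        spin_lock_irq(&tsk->sighand->siglock);
        /* We are sharing ->siglock with it_real_fn() */
        if (hrtimer_try_to_cancel(timer) < 0) {
            spin_unlock_irq(&tsk->sighand->siglock);
            hrtimer_wait_for_timer(&tsk->signal->real_timer);
            goto again;
        }

Without the wait, a high-priority RT task spinning in this loop could livelock against the lower-priority context that is still executing the timer callback.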
17524 diff -Nur linux-4.9.6.orig/kernel/time/jiffies.c linux-4.9.6/kernel/time/jiffies.c
17525 --- linux-4.9.6.orig/kernel/time/jiffies.c 2017-01-26 08:25:24.000000000 +0100
17526 +++ linux-4.9.6/kernel/time/jiffies.c 2017-01-28 13:59:10.087662182 +0100
17531 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
17532 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
17533 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
17535 #if (BITS_PER_LONG < 64)
17536 u64 get_jiffies_64(void)
17541 - seq = read_seqbegin(&jiffies_lock);
17542 + seq = read_seqcount_begin(&jiffies_seq);
17544 - } while (read_seqretry(&jiffies_lock, seq));
17545 + } while (read_seqcount_retry(&jiffies_seq, seq));
17548 EXPORT_SYMBOL(get_jiffies_64);
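The conversion above splits the old jiffies_lock seqlock into a raw spinlock for writer exclusion plus a bare seqcount for readers, so the write side stays non-sleeping on RT while readers remain lockless. Both halves are used like this (the tick-common.c, tick-sched.c and timekeeping.c hunks below follow exactly this pattern):

        /* Writer (tick handler), serialized by the raw lock: */
        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);
        /* ... update jiffies_64, tick_next_period ... */
        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        /* Lockless reader: */
        unsigned int seq;
        do {
            seq = read_seqcount_begin(&jiffies_seq);
            /* ... snapshot jiffies_64 ... */
        } while (read_seqcount_retry(&jiffies_seq, seq));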
17549 diff -Nur linux-4.9.6.orig/kernel/time/ntp.c linux-4.9.6/kernel/time/ntp.c
17550 --- linux-4.9.6.orig/kernel/time/ntp.c 2017-01-26 08:25:24.000000000 +0100
17551 +++ linux-4.9.6/kernel/time/ntp.c 2017-01-28 13:59:10.087662182 +0100
17553 #include <linux/module.h>
17554 #include <linux/rtc.h>
17555 #include <linux/math64.h>
17556 +#include <linux/swork.h>
17558 #include "ntp_internal.h"
17559 #include "timekeeping_internal.h"
17560 @@ -568,10 +569,35 @@
17561 &sync_cmos_work, timespec64_to_jiffies(&next));
17564 +#ifdef CONFIG_PREEMPT_RT_FULL
17566 +static void run_clock_set_delay(struct swork_event *event)
17568 + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17571 +static struct swork_event ntp_cmos_swork;
17573 +void ntp_notify_cmos_timer(void)
17575 + swork_queue(&ntp_cmos_swork);
17578 +static __init int create_cmos_delay_thread(void)
17580 + WARN_ON(swork_get());
17581 + INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
17584 +early_initcall(create_cmos_delay_thread);
17588 void ntp_notify_cmos_timer(void)
17590 queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
17592 +#endif /* CONFIG_PREEMPT_RT_FULL */
17595 void ntp_notify_cmos_timer(void) { }
17596 diff -Nur linux-4.9.6.orig/kernel/time/posix-cpu-timers.c linux-4.9.6/kernel/time/posix-cpu-timers.c
17597 --- linux-4.9.6.orig/kernel/time/posix-cpu-timers.c 2017-01-26 08:25:24.000000000 +0100
17598 +++ linux-4.9.6/kernel/time/posix-cpu-timers.c 2017-01-28 13:59:10.087662182 +0100
17602 #include <linux/sched.h>
17603 +#include <linux/sched/rt.h>
17604 #include <linux/posix-timers.h>
17605 #include <linux/errno.h>
17606 #include <linux/math64.h>
17607 @@ -620,7 +621,7 @@
17609 * Disarm any old timer after extracting its expiry time.
17611 - WARN_ON_ONCE(!irqs_disabled());
17612 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17615 old_incr = timer->it.cpu.incr;
17616 @@ -1064,7 +1065,7 @@
17618 * Now re-arm for the new expiry time.
17620 - WARN_ON_ONCE(!irqs_disabled());
17621 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17623 unlock_task_sighand(p, &flags);
17625 @@ -1153,13 +1154,13 @@
17626 * already updated our counts. We need to check if any timers fire now.
17627 * Interrupts are disabled.
17629 -void run_posix_cpu_timers(struct task_struct *tsk)
17630 +static void __run_posix_cpu_timers(struct task_struct *tsk)
17633 struct k_itimer *timer, *next;
17634 unsigned long flags;
17636 - WARN_ON_ONCE(!irqs_disabled());
17637 + WARN_ON_ONCE_NONRT(!irqs_disabled());
17640 * The fast path checks that there are no expired thread or thread
17641 @@ -1213,6 +1214,190 @@
17645 +#ifdef CONFIG_PREEMPT_RT_BASE
17646 +#include <linux/kthread.h>
17647 +#include <linux/cpu.h>
17648 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
17649 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
17651 +static int posix_cpu_timers_thread(void *data)
17653 + int cpu = (long)data;
17655 + BUG_ON(per_cpu(posix_timer_task, cpu) != current);
17657 + while (!kthread_should_stop()) {
17658 + struct task_struct *tsk = NULL;
17659 + struct task_struct *next = NULL;
17661 + if (cpu_is_offline(cpu))
17662 + goto wait_to_die;
17664 + /* grab task list */
17665 + raw_local_irq_disable();
17666 + tsk = per_cpu(posix_timer_tasklist, cpu);
17667 + per_cpu(posix_timer_tasklist, cpu) = NULL;
17668 + raw_local_irq_enable();
17670 + /* it's possible the list is empty, just return */
17672 + set_current_state(TASK_INTERRUPTIBLE);
17674 + __set_current_state(TASK_RUNNING);
17678 + /* Process task list */
17681 + next = tsk->posix_timer_list;
17683 + /* run the task timers, clear its ptr and
17686 + __run_posix_cpu_timers(tsk);
17687 + tsk->posix_timer_list = NULL;
17688 + put_task_struct(tsk);
17690 + /* check if this is the last on the list */
17699 + /* Wait for kthread_stop */
17700 + set_current_state(TASK_INTERRUPTIBLE);
17701 + while (!kthread_should_stop()) {
17703 + set_current_state(TASK_INTERRUPTIBLE);
17705 + __set_current_state(TASK_RUNNING);
17709 +static inline int __fastpath_timer_check(struct task_struct *tsk)
17711 + /* tsk == current, ensure it is safe to use ->signal/sighand */
17712 + if (unlikely(tsk->exit_state))
17715 + if (!task_cputime_zero(&tsk->cputime_expires))
17718 + if (!task_cputime_zero(&tsk->signal->cputime_expires))
17724 +void run_posix_cpu_timers(struct task_struct *tsk)
17726 + unsigned long cpu = smp_processor_id();
17727 + struct task_struct *tasklist;
17729 + BUG_ON(!irqs_disabled());
17730 + if (!per_cpu(posix_timer_task, cpu))
17732 + /* get per-cpu references */
17733 + tasklist = per_cpu(posix_timer_tasklist, cpu);
17735 + /* check to see if we're already queued */
17736 + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
17737 + get_task_struct(tsk);
17739 + tsk->posix_timer_list = tasklist;
17742 + * The list is terminated by a self-pointing
17745 + tsk->posix_timer_list = tsk;
17747 + per_cpu(posix_timer_tasklist, cpu) = tsk;
17749 + wake_up_process(per_cpu(posix_timer_task, cpu));
17754 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
17755 + * Here we can start up the necessary timer thread for the new CPU.
17757 +static int posix_cpu_thread_call(struct notifier_block *nfb,
17758 + unsigned long action, void *hcpu)
17760 + int cpu = (long)hcpu;
17761 + struct task_struct *p;
17762 + struct sched_param param;
17764 + switch (action) {
17765 + case CPU_UP_PREPARE:
17766 + p = kthread_create(posix_cpu_timers_thread, hcpu,
17767 + "posixcputmr/%d",cpu);
17769 + return NOTIFY_BAD;
17770 + p->flags |= PF_NOFREEZE;
17771 + kthread_bind(p, cpu);
17772 + /* Must be high prio to avoid getting starved */
17773 + param.sched_priority = MAX_RT_PRIO-1;
17774 + sched_setscheduler(p, SCHED_FIFO, &param);
17775 + per_cpu(posix_timer_task, cpu) = p;
17778 + /* Strictly unnecessary, as the first user will wake it. */
17779 + wake_up_process(per_cpu(posix_timer_task,cpu));
17781 +#ifdef CONFIG_HOTPLUG_CPU
17782 + case CPU_UP_CANCELED:
17783 + /* Unbind it from offline cpu so it can run. Fall thru. */
17784 + kthread_bind(per_cpu(posix_timer_task, cpu),
17785 + cpumask_any(cpu_online_mask));
17786 + kthread_stop(per_cpu(posix_timer_task, cpu));
17787 + per_cpu(posix_timer_task, cpu) = NULL;
17790 + kthread_stop(per_cpu(posix_timer_task, cpu));
17791 + per_cpu(posix_timer_task, cpu) = NULL;
17795 + return NOTIFY_OK;
17798 +/* Register at highest priority so that task migration (migrate_all_tasks)
17799 + * happens before everything else.
17801 +static struct notifier_block posix_cpu_thread_notifier = {
17802 + .notifier_call = posix_cpu_thread_call,
17806 +static int __init posix_cpu_thread_init(void)
17808 + void *hcpu = (void *)(long)smp_processor_id();
17809 + /* Start one for boot CPU. */
17810 + unsigned long cpu;
17812 + /* init the per-cpu posix_timer_tasklists */
17813 + for_each_possible_cpu(cpu)
17814 + per_cpu(posix_timer_tasklist, cpu) = NULL;
17816 + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
17817 + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
17818 + register_cpu_notifier(&posix_cpu_thread_notifier);
17821 +early_initcall(posix_cpu_thread_init);
17822 +#else /* CONFIG_PREEMPT_RT_BASE */
17823 +void run_posix_cpu_timers(struct task_struct *tsk)
17825 + __run_posix_cpu_timers(tsk);
17827 +#endif /* CONFIG_PREEMPT_RT_BASE */
17830 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
17831 * The tsk->sighand->siglock must be held by the caller.
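A subtle detail in run_posix_cpu_timers() above: posix_timer_list terminates with a self-pointing tail rather than NULL, so NULL can keep meaning "not queued". A standalone userspace model of walking such a list (illustrative only):

    #include <stdio.h>

    struct task {
        const char *name;
        struct task *next;    /* NULL: not queued; self: tail */
    };

    int main(void)
    {
        struct task c = { "C", &c };    /* tail points to itself */
        struct task b = { "B", &c };
        struct task a = { "A", &b };
        struct task *t = &a;

        while (t) {
            struct task *next = t->next;

            printf("%s\n", t->name);
            t->next = NULL;                   /* mark as dequeued */
            t = (next == t) ? NULL : next;    /* self means: done */
        }
        return 0;
    }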
17832 diff -Nur linux-4.9.6.orig/kernel/time/posix-timers.c linux-4.9.6/kernel/time/posix-timers.c
17833 --- linux-4.9.6.orig/kernel/time/posix-timers.c 2017-01-26 08:25:24.000000000 +0100
17834 +++ linux-4.9.6/kernel/time/posix-timers.c 2017-01-28 13:59:10.087662182 +0100
17835 @@ -506,6 +506,7 @@
17836 static struct pid *good_sigevent(sigevent_t * event)
17838 struct task_struct *rtn = current->group_leader;
17839 + int sig = event->sigev_signo;
17841 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
17842 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
17843 @@ -514,7 +515,8 @@
17846 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
17847 - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
17848 + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
17849 + sig_kernel_coredump(sig)))
17852 return task_pid(rtn);
17853 @@ -826,6 +828,20 @@
17858 + * Protected by RCU!
17860 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
17862 +#ifdef CONFIG_PREEMPT_RT_FULL
17863 + if (kc->timer_set == common_timer_set)
17864 + hrtimer_wait_for_timer(&timr->it.real.timer);
17866 + /* FIXME: Whacky hack for posix-cpu-timers */
17867 + schedule_timeout(1);
17871 /* Set a POSIX.1b interval timer. */
17872 /* timr->it_lock is taken. */
17874 @@ -903,6 +919,7 @@
17879 kc = clockid_to_kclock(timr->it_clock);
17880 if (WARN_ON_ONCE(!kc || !kc->timer_set))
17882 @@ -911,9 +928,12 @@
17884 unlock_timer(timr, flag);
17885 if (error == TIMER_RETRY) {
17886 + timer_wait_for_callback(kc, timr);
17887 rtn = NULL; // We already got the old time...
17888 + rcu_read_unlock();
17891 + rcu_read_unlock();
17893 if (old_setting && !error &&
17894 copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
17895 @@ -951,10 +971,15 @@
17900 if (timer_delete_hook(timer) == TIMER_RETRY) {
17901 unlock_timer(timer, flags);
17902 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
17904 + rcu_read_unlock();
17907 + rcu_read_unlock();
17909 spin_lock(&current->sighand->siglock);
17910 list_del(&timer->list);
17911 @@ -980,8 +1005,18 @@
17913 spin_lock_irqsave(&timer->it_lock, flags);
17915 + /* On RT we can race with a deletion */
17916 + if (!timer->it_signal) {
17917 + unlock_timer(timer, flags);
17921 if (timer_delete_hook(timer) == TIMER_RETRY) {
17923 unlock_timer(timer, flags);
17924 + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
17926 + rcu_read_unlock();
17929 list_del(&timer->list);
17930 diff -Nur linux-4.9.6.orig/kernel/time/tick-broadcast-hrtimer.c linux-4.9.6/kernel/time/tick-broadcast-hrtimer.c
17931 --- linux-4.9.6.orig/kernel/time/tick-broadcast-hrtimer.c 2017-01-26 08:25:24.000000000 +0100
17932 +++ linux-4.9.6/kernel/time/tick-broadcast-hrtimer.c 2017-01-28 13:59:10.087662182 +0100
17933 @@ -107,5 +107,6 @@
17935 hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
17936 bctimer.function = bc_handler;
17937 + bctimer.irqsafe = true;
17938 clockevents_register_device(&ce_broadcast_hrtimer);
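Setting ->irqsafe here matters because, with the hrtimer changes above, RT defers all non-irqsafe callbacks to the HRTIMER_SOFTIRQ thread (see hrtimer_rt_defer()); the broadcast handler must keep firing from hard irq context. The same marking pattern, sketched with hypothetical names (irqsafe is the struct hrtimer field this patch introduces):

    static struct hrtimer my_timer;

    /* Must not sleep: on RT this still runs in hard irq context. */
    static enum hrtimer_restart my_fn(struct hrtimer *t)
    {
        return HRTIMER_NORESTART;
    }

    static void my_setup(void)
    {
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        my_timer.function = my_fn;
        my_timer.irqsafe = 1;
    }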
17940 diff -Nur linux-4.9.6.orig/kernel/time/tick-common.c linux-4.9.6/kernel/time/tick-common.c
17941 --- linux-4.9.6.orig/kernel/time/tick-common.c 2017-01-26 08:25:24.000000000 +0100
17942 +++ linux-4.9.6/kernel/time/tick-common.c 2017-01-28 13:59:10.087662182 +0100
17943 @@ -79,13 +79,15 @@
17944 static void tick_periodic(int cpu)
17946 if (tick_do_timer_cpu == cpu) {
17947 - write_seqlock(&jiffies_lock);
17948 + raw_spin_lock(&jiffies_lock);
17949 + write_seqcount_begin(&jiffies_seq);
17951 /* Keep track of the next tick event */
17952 tick_next_period = ktime_add(tick_next_period, tick_period);
17955 - write_sequnlock(&jiffies_lock);
17956 + write_seqcount_end(&jiffies_seq);
17957 + raw_spin_unlock(&jiffies_lock);
17958 update_wall_time();
17961 @@ -157,9 +159,9 @@
17965 - seq = read_seqbegin(&jiffies_lock);
17966 + seq = read_seqcount_begin(&jiffies_seq);
17967 next = tick_next_period;
17968 - } while (read_seqretry(&jiffies_lock, seq));
17969 + } while (read_seqcount_retry(&jiffies_seq, seq));
17971 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
17973 diff -Nur linux-4.9.6.orig/kernel/time/tick-sched.c linux-4.9.6/kernel/time/tick-sched.c
17974 --- linux-4.9.6.orig/kernel/time/tick-sched.c 2017-01-26 08:25:24.000000000 +0100
17975 +++ linux-4.9.6/kernel/time/tick-sched.c 2017-01-28 13:59:10.087662182 +0100
17979 /* Reevaluate with jiffies_lock held */
17980 - write_seqlock(&jiffies_lock);
17981 + raw_spin_lock(&jiffies_lock);
17982 + write_seqcount_begin(&jiffies_seq);
17984 delta = ktime_sub(now, last_jiffies_update);
17985 if (delta.tv64 >= tick_period.tv64) {
17986 @@ -85,10 +86,12 @@
17987 /* Keep the tick_next_period variable up to date */
17988 tick_next_period = ktime_add(last_jiffies_update, tick_period);
17990 - write_sequnlock(&jiffies_lock);
17991 + write_seqcount_end(&jiffies_seq);
17992 + raw_spin_unlock(&jiffies_lock);
17995 - write_sequnlock(&jiffies_lock);
17996 + write_seqcount_end(&jiffies_seq);
17997 + raw_spin_unlock(&jiffies_lock);
17998 update_wall_time();
18001 @@ -99,12 +102,14 @@
18005 - write_seqlock(&jiffies_lock);
18006 + raw_spin_lock(&jiffies_lock);
18007 + write_seqcount_begin(&jiffies_seq);
18008 /* Did we start the jiffies update yet ? */
18009 if (last_jiffies_update.tv64 == 0)
18010 last_jiffies_update = tick_next_period;
18011 period = last_jiffies_update;
18012 - write_sequnlock(&jiffies_lock);
18013 + write_seqcount_end(&jiffies_seq);
18014 + raw_spin_unlock(&jiffies_lock);
18018 @@ -215,6 +220,7 @@
18020 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
18021 .func = nohz_full_kick_func,
18022 + .flags = IRQ_WORK_HARD_IRQ,
18026 @@ -673,10 +679,10 @@
18028 /* Read jiffies and the time when jiffies were updated last */
18030 - seq = read_seqbegin(&jiffies_lock);
18031 + seq = read_seqcount_begin(&jiffies_seq);
18032 basemono = last_jiffies_update.tv64;
18033 basejiff = jiffies;
18034 - } while (read_seqretry(&jiffies_lock, seq));
18035 + } while (read_seqcount_retry(&jiffies_seq, seq));
18036 ts->last_jiffies = basejiff;
18038 if (rcu_needs_cpu(basemono, &next_rcu) ||
18039 @@ -877,14 +883,7 @@
18042 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
18043 - static int ratelimit;
18045 - if (ratelimit < 10 &&
18046 - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18047 - pr_warn("NOHZ: local_softirq_pending %02x\n",
18048 - (unsigned int) local_softirq_pending());
18051 + softirq_check_pending_idle();
18055 @@ -1193,6 +1192,7 @@
18056 * Emulate tick processing via per-CPU hrtimers:
18058 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18059 + ts->sched_timer.irqsafe = 1;
18060 ts->sched_timer.function = tick_sched_timer;
18062 /* Get the next period (per-CPU) */
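
Two RT details in the tick-sched hunks are worth spelling out: the per-CPU
tick-emulation hrtimer gets the same irqsafe treatment as the broadcast timer
earlier, and the nohz_full kick is tagged IRQ_WORK_HARD_IRQ, an RT-patch flag
that keeps an irq_work item in hard interrupt context instead of the softirq
where RT normally defers such work. The tagging in isolation (demo_work and
demo_func are made-up names):

#include <linux/irq_work.h>

static void demo_func(struct irq_work *w)
{
	/* must stay short and non-sleeping: hard irq context */
}

static struct irq_work demo_work = {
	.func  = demo_func,
	.flags = IRQ_WORK_HARD_IRQ,	/* same pattern as nohz_full_kick_work */
};

static void demo_kick(void)		/* callable from non-sleeping context */
{
	irq_work_queue(&demo_work);
}
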
18063 diff -Nur linux-4.9.6.orig/kernel/time/timekeeping.c linux-4.9.6/kernel/time/timekeeping.c
18064 --- linux-4.9.6.orig/kernel/time/timekeeping.c 2017-01-26 08:25:24.000000000 +0100
18065 +++ linux-4.9.6/kernel/time/timekeeping.c 2017-01-28 13:59:10.087662182 +0100
18066 @@ -2328,8 +2328,10 @@
18068 void xtime_update(unsigned long ticks)
18070 - write_seqlock(&jiffies_lock);
18071 + raw_spin_lock(&jiffies_lock);
18072 + write_seqcount_begin(&jiffies_seq);
18074 - write_sequnlock(&jiffies_lock);
18075 + write_seqcount_end(&jiffies_seq);
18076 + raw_spin_unlock(&jiffies_lock);
18077 update_wall_time();
18079 diff -Nur linux-4.9.6.orig/kernel/time/timekeeping.h linux-4.9.6/kernel/time/timekeeping.h
18080 --- linux-4.9.6.orig/kernel/time/timekeeping.h 2017-01-26 08:25:24.000000000 +0100
18081 +++ linux-4.9.6/kernel/time/timekeeping.h 2017-01-28 13:59:10.087662182 +0100
18083 extern void do_timer(unsigned long ticks);
18084 extern void update_wall_time(void);
18086 -extern seqlock_t jiffies_lock;
18087 +extern raw_spinlock_t jiffies_lock;
18088 +extern seqcount_t jiffies_seq;
18090 #define CS_NAME_LEN 32
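
All of the jiffies_lock hunks above implement a single change: the seqlock is
split into a raw spinlock (writer exclusion, never sleeps, so the tick path
can take it even on RT) plus a bare seqcount for reader retry. A userspace
analogue of the resulting pattern, with a pthread mutex standing in for the
raw spinlock and C11 atomics for the seqcount (sketch only, not kernel code):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t jlock = PTHREAD_MUTEX_INITIALIZER; /* writer exclusion */
static atomic_uint jseq;			/* analogue of jiffies_seq */
static _Atomic unsigned long long jiffies64;	/* the protected value */

static void tick(void)		/* writer side, cf. tick_periodic() */
{
	pthread_mutex_lock(&jlock);
	jseq++;			/* odd: update in progress */
	jiffies64++;
	jseq++;			/* even again: update complete */
	pthread_mutex_unlock(&jlock);
}

static unsigned long long read_jiffies(void)	/* lockless reader */
{
	unsigned int s;
	unsigned long long v;

	do {
		s = jseq;
		v = jiffies64;
	} while ((s & 1) || s != jseq);	/* retry if a write ran concurrently */
	return v;
}
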
18092 diff -Nur linux-4.9.6.orig/kernel/time/timer.c linux-4.9.6/kernel/time/timer.c
18093 --- linux-4.9.6.orig/kernel/time/timer.c 2017-01-26 08:25:24.000000000 +0100
18094 +++ linux-4.9.6/kernel/time/timer.c 2017-01-28 13:59:10.087662182 +0100
18095 @@ -193,8 +193,11 @@
18098 struct timer_base {
18100 + raw_spinlock_t lock;
18101 struct timer_list *running_timer;
18102 +#ifdef CONFIG_PREEMPT_RT_FULL
18103 + struct swait_queue_head wait_for_running_timer;
18106 unsigned long next_expiry;
18108 @@ -948,10 +951,10 @@
18110 if (!(tf & TIMER_MIGRATING)) {
18111 base = get_timer_base(tf);
18112 - spin_lock_irqsave(&base->lock, *flags);
18113 + raw_spin_lock_irqsave(&base->lock, *flags);
18114 if (timer->flags == tf)
18116 - spin_unlock_irqrestore(&base->lock, *flags);
18117 + raw_spin_unlock_irqrestore(&base->lock, *flags);
18121 @@ -1023,9 +1026,9 @@
18122 /* See the comment in lock_timer_base() */
18123 timer->flags |= TIMER_MIGRATING;
18125 - spin_unlock(&base->lock);
18126 + raw_spin_unlock(&base->lock);
18128 - spin_lock(&base->lock);
18129 + raw_spin_lock(&base->lock);
18130 WRITE_ONCE(timer->flags,
18131 (timer->flags & ~TIMER_BASEMASK) | base->cpu);
18133 @@ -1050,7 +1053,7 @@
18137 - spin_unlock_irqrestore(&base->lock, flags);
18138 + raw_spin_unlock_irqrestore(&base->lock, flags);
18142 @@ -1144,19 +1147,46 @@
18143 if (base != new_base) {
18144 timer->flags |= TIMER_MIGRATING;
18146 - spin_unlock(&base->lock);
18147 + raw_spin_unlock(&base->lock);
18149 - spin_lock(&base->lock);
18150 + raw_spin_lock(&base->lock);
18151 WRITE_ONCE(timer->flags,
18152 (timer->flags & ~TIMER_BASEMASK) | cpu);
18155 debug_activate(timer, timer->expires);
18156 internal_add_timer(base, timer);
18157 - spin_unlock_irqrestore(&base->lock, flags);
18158 + raw_spin_unlock_irqrestore(&base->lock, flags);
18160 EXPORT_SYMBOL_GPL(add_timer_on);
18162 +#ifdef CONFIG_PREEMPT_RT_FULL
18164 + * Wait for a running timer
18166 +static void wait_for_running_timer(struct timer_list *timer)
18168 + struct timer_base *base;
18169 + u32 tf = timer->flags;
18171 + if (tf & TIMER_MIGRATING)
18174 + base = get_timer_base(tf);
18175 + swait_event(base->wait_for_running_timer,
18176 + base->running_timer != timer);
18179 +# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
18181 +static inline void wait_for_running_timer(struct timer_list *timer)
18186 +# define wakeup_timer_waiters(b) do { } while (0)
18190 * del_timer - deactivate a timer.
18191 * @timer: the timer to be deactivated
18192 @@ -1180,7 +1210,7 @@
18193 if (timer_pending(timer)) {
18194 base = lock_timer_base(timer, &flags);
18195 ret = detach_if_pending(timer, base, true);
18196 - spin_unlock_irqrestore(&base->lock, flags);
18197 + raw_spin_unlock_irqrestore(&base->lock, flags);
18201 @@ -1208,13 +1238,13 @@
18202 timer_stats_timer_clear_start_info(timer);
18203 ret = detach_if_pending(timer, base, true);
18205 - spin_unlock_irqrestore(&base->lock, flags);
18206 + raw_spin_unlock_irqrestore(&base->lock, flags);
18210 EXPORT_SYMBOL(try_to_del_timer_sync);
18213 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
18215 * del_timer_sync - deactivate a timer and wait for the handler to finish.
18216 * @timer: the timer to be deactivated
18217 @@ -1274,7 +1304,7 @@
18218 int ret = try_to_del_timer_sync(timer);
18222 + wait_for_running_timer(timer);
18225 EXPORT_SYMBOL(del_timer_sync);
18226 @@ -1339,14 +1369,17 @@
18227 fn = timer->function;
18228 data = timer->data;
18230 - if (timer->flags & TIMER_IRQSAFE) {
18231 - spin_unlock(&base->lock);
18232 + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
18233 + timer->flags & TIMER_IRQSAFE) {
18234 + raw_spin_unlock(&base->lock);
18235 call_timer_fn(timer, fn, data);
18236 - spin_lock(&base->lock);
18237 + base->running_timer = NULL;
18238 + raw_spin_lock(&base->lock);
18240 - spin_unlock_irq(&base->lock);
18241 + raw_spin_unlock_irq(&base->lock);
18242 call_timer_fn(timer, fn, data);
18243 - spin_lock_irq(&base->lock);
18244 + base->running_timer = NULL;
18245 + raw_spin_lock_irq(&base->lock);
18249 @@ -1515,7 +1548,7 @@
18250 if (cpu_is_offline(smp_processor_id()))
18253 - spin_lock(&base->lock);
18254 + raw_spin_lock(&base->lock);
18255 nextevt = __next_timer_interrupt(base);
18256 is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
18257 base->next_expiry = nextevt;
18258 @@ -1543,7 +1576,7 @@
18259 if ((expires - basem) > TICK_NSEC)
18260 base->is_idle = true;
18262 - spin_unlock(&base->lock);
18263 + raw_spin_unlock(&base->lock);
18265 return cmp_next_hrtimer_event(basem, expires);
18267 @@ -1608,13 +1641,13 @@
18269 /* Note: this timer irq context must be accounted for as well. */
18270 account_process_tick(p, user_tick);
18271 + scheduler_tick();
18272 run_local_timers();
18273 rcu_check_callbacks(user_tick);
18274 -#ifdef CONFIG_IRQ_WORK
18275 +#if defined(CONFIG_IRQ_WORK)
18279 - scheduler_tick();
18280 run_posix_cpu_timers(p);
18283 @@ -1630,7 +1663,7 @@
18284 if (!time_after_eq(jiffies, base->clk))
18287 - spin_lock_irq(&base->lock);
18288 + raw_spin_lock_irq(&base->lock);
18290 while (time_after_eq(jiffies, base->clk)) {
18292 @@ -1640,8 +1673,8 @@
18294 expire_timers(base, heads + levels);
18296 - base->running_timer = NULL;
18297 - spin_unlock_irq(&base->lock);
18298 + raw_spin_unlock_irq(&base->lock);
18299 + wakeup_timer_waiters(base);
18303 @@ -1651,6 +1684,8 @@
18305 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
18307 + irq_work_tick_soft();
18309 __run_timers(base);
18310 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
18311 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
18312 @@ -1836,16 +1871,16 @@
18313 * The caller is globally serialized and nobody else
18314 * takes two locks at once, deadlock is not possible.
18316 - spin_lock_irq(&new_base->lock);
18317 - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18318 + raw_spin_lock_irq(&new_base->lock);
18319 + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
18321 BUG_ON(old_base->running_timer);
18323 for (i = 0; i < WHEEL_SIZE; i++)
18324 migrate_timer_list(new_base, old_base->vectors + i);
18326 - spin_unlock(&old_base->lock);
18327 - spin_unlock_irq(&new_base->lock);
18328 + raw_spin_unlock(&old_base->lock);
18329 + raw_spin_unlock_irq(&new_base->lock);
18330 put_cpu_ptr(&timer_bases);
18333 @@ -1861,8 +1896,11 @@
18334 for (i = 0; i < NR_BASES; i++) {
18335 base = per_cpu_ptr(&timer_bases[i], cpu);
18337 - spin_lock_init(&base->lock);
18338 + raw_spin_lock_init(&base->lock);
18339 base->clk = jiffies;
18340 +#ifdef CONFIG_PREEMPT_RT_FULL
18341 + init_swait_queue_head(&base->wait_for_running_timer);
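
The timer.c conversion follows the same recipe: base->lock becomes a raw
spinlock so the timer softirq (threaded on RT) can take it, and
del_timer_sync() stops busy-waiting on a running callback. Spinning can
livelock on RT when the deleter has preempted the softirq thread that must
finish the callback, so deleters now sleep on the new swait queue and the
expiry path wakes them after clearing running_timer. A condition-variable
analogue of that handshake (userspace sketch, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct base {
	pthread_mutex_t lock;
	pthread_cond_t  waiters;	/* wait_for_running_timer */
	void           *running_timer;	/* callback currently executing */
};

/* expiry side: run one callback, then wake any sleeping deleters */
static void expire_one(struct base *b, void *timer, void (*fn)(void))
{
	pthread_mutex_lock(&b->lock);
	b->running_timer = timer;
	pthread_mutex_unlock(&b->lock);

	fn();				/* callback runs without the lock */

	pthread_mutex_lock(&b->lock);
	b->running_timer = NULL;
	pthread_cond_broadcast(&b->waiters);	/* wakeup_timer_waiters() */
	pthread_mutex_unlock(&b->lock);
}

/* deletion side: sleep instead of spinning until the callback is done */
static void wait_for_running_timer(struct base *b, void *timer)
{
	pthread_mutex_lock(&b->lock);
	while (b->running_timer == timer)
		pthread_cond_wait(&b->waiters, &b->lock);
	pthread_mutex_unlock(&b->lock);
}
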
18346 diff -Nur linux-4.9.6.orig/kernel/trace/Kconfig linux-4.9.6/kernel/trace/Kconfig
18347 --- linux-4.9.6.orig/kernel/trace/Kconfig 2017-01-26 08:25:24.000000000 +0100
18348 +++ linux-4.9.6/kernel/trace/Kconfig 2017-01-28 13:59:10.087662182 +0100
18349 @@ -182,6 +182,24 @@
18350 enabled. This option and the preempt-off timing option can be
18351 used together or separately.)
18353 +config INTERRUPT_OFF_HIST
18354 + bool "Interrupts-off Latency Histogram"
18355 + depends on IRQSOFF_TRACER
18357 + This option generates continuously updated histograms (one per cpu)
18358 + of the duration of time periods with interrupts disabled. The
18358 +	  histograms are disabled by default. To enable them, write a non-zero
18359 +	  number to
18362 + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18364 + If PREEMPT_OFF_HIST is also selected, additional histograms (one
18365 + per cpu) are generated that accumulate the duration of time periods
18366 + when both interrupts and preemption are disabled. The histogram data
18367 + will be located in the debug file system at
18369 + /sys/kernel/debug/tracing/latency_hist/irqsoff
18371 config PREEMPT_TRACER
18372 bool "Preemption-off Latency Tracer"
18374 @@ -206,6 +224,24 @@
18375 enabled. This option and the irqs-off timing option can be
18376 used together or separately.)
18378 +config PREEMPT_OFF_HIST
18379 + bool "Preemption-off Latency Histogram"
18380 + depends on PREEMPT_TRACER
18382 + This option generates continuously updated histograms (one per cpu)
18383 + of the duration of time periods with preemption disabled. The
18384 +	  histograms are disabled by default. To enable them, write a non-zero
18385 +	  number to
18387 + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
18389 + If INTERRUPT_OFF_HIST is also selected, additional histograms (one
18390 + per cpu) are generated that accumulate the duration of time periods
18391 + when both interrupts and preemption are disabled. The histogram data
18392 + will be located in the debug file system at
18394 + /sys/kernel/debug/tracing/latency_hist/preemptoff
18396 config SCHED_TRACER
18397 bool "Scheduling Latency Tracer"
18398 select GENERIC_TRACER
18399 @@ -251,6 +287,74 @@
18400 file. Every time a latency is greater than tracing_thresh, it will
18401 be recorded into the ring buffer.
18403 +config WAKEUP_LATENCY_HIST
18404 + bool "Scheduling Latency Histogram"
18405 + depends on SCHED_TRACER
18407 + This option generates continuously updated histograms (one per cpu)
18408 + of the scheduling latency of the highest priority task.
18409 + The histograms are disabled by default. To enable them, write a
18410 + non-zero number to
18412 + /sys/kernel/debug/tracing/latency_hist/enable/wakeup
18414 + Two different algorithms are used, one to determine the latency of
18415 + processes that exclusively use the highest priority of the system and
18416 + another one to determine the latency of processes that share the
18417 + highest system priority with other processes. The former is used to
18418 + improve hardware and system software, the latter to optimize the
18419 + priority design of a given system. The histogram data will be
18420 + located in the debug file system at
18422 +	      /sys/kernel/debug/tracing/latency_hist/wakeup
18424 +	  and
18426 + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
18428 + If both Scheduling Latency Histogram and Missed Timer Offsets
18429 + Histogram are selected, additional histogram data will be collected
18430 + that contain, in addition to the wakeup latency, the timer latency, in
18431 + case the wakeup was triggered by an expired timer. These histograms
18432 + are available in the
18434 + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18436 + directory. They reflect the apparent interrupt and scheduling latency
18437 +	  and are best suited to determining the worst-case latency of a given
18438 + system. To enable these histograms, write a non-zero number to
18440 + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18442 +config MISSED_TIMER_OFFSETS_HIST
18443 + depends on HIGH_RES_TIMERS
18444 + select GENERIC_TRACER
18445 + bool "Missed Timer Offsets Histogram"
18447 + Generate a histogram of missed timer offsets in microseconds. The
18448 +	  histograms are disabled by default. To enable them, write a non-zero
18449 +	  number to
18451 + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
18453 + The histogram data will be located in the debug file system at
18455 + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
18457 + If both Scheduling Latency Histogram and Missed Timer Offsets
18458 + Histogram are selected, additional histogram data will be collected
18459 + that contain, in addition to the wakeup latency, the timer latency, in
18460 + case the wakeup was triggered by an expired timer. These histograms
18461 + are available in the
18463 + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
18465 + directory. They reflect the apparent interrupt and scheduling latency
18466 +	  and are best suited to determining the worst-case latency of a given
18467 + system. To enable these histograms, write a non-zero number to
18469 + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
18471 config ENABLE_DEFAULT_TRACERS
18472 bool "Trace process context switches and events"
18473 depends on !GENERIC_TRACER
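
All four histogram options are operated the same way at run time: write a
non-zero value to the matching file under
/sys/kernel/debug/tracing/latency_hist/enable/ and read the per-CPU "CPU<n>"
files back (the naming comes from latency_hist_init() below). A minimal
sketch for the wakeup histogram, assuming debugfs is mounted in the usual
place and the relevant *_HIST options are enabled:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define HIST "/sys/kernel/debug/tracing/latency_hist/"

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(HIST "enable/wakeup", O_WRONLY);
	if (fd < 0) {
		perror(HIST "enable/wakeup");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		return 1;
	}
	close(fd);

	sleep(10);	/* let the workload of interest run */

	fd = open(HIST "wakeup/CPU0", O_RDONLY);
	if (fd < 0) {
		perror(HIST "wakeup/CPU0");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}
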
18474 diff -Nur linux-4.9.6.orig/kernel/trace/latency_hist.c linux-4.9.6/kernel/trace/latency_hist.c
18475 --- linux-4.9.6.orig/kernel/trace/latency_hist.c 1970-01-01 01:00:00.000000000 +0100
18476 +++ linux-4.9.6/kernel/trace/latency_hist.c 2017-01-28 13:59:10.091662336 +0100
18479 + * kernel/trace/latency_hist.c
18481 + * Add support for histograms of preemption-off latency and
18482 + * interrupt-off latency and wakeup latency, it depends on
18483 + * Real-Time Preemption Support.
18485 + * Copyright (C) 2005 MontaVista Software, Inc.
18486 + * Yi Yang <yyang@ch.mvista.com>
18488 + * Converted to work with the new latency tracer.
18489 + * Copyright (C) 2008 Red Hat, Inc.
18490 + * Steven Rostedt <srostedt@redhat.com>
18493 +#include <linux/module.h>
18494 +#include <linux/debugfs.h>
18495 +#include <linux/seq_file.h>
18496 +#include <linux/percpu.h>
18497 +#include <linux/kallsyms.h>
18498 +#include <linux/uaccess.h>
18499 +#include <linux/sched.h>
18500 +#include <linux/sched/rt.h>
18501 +#include <linux/slab.h>
18502 +#include <linux/atomic.h>
18503 +#include <asm/div64.h>
18505 +#include "trace.h"
18506 +#include <trace/events/sched.h>
18508 +#define NSECS_PER_USECS 1000L
18510 +#define CREATE_TRACE_POINTS
18511 +#include <trace/events/hist.h>
18514 + IRQSOFF_LATENCY = 0,
18515 + PREEMPTOFF_LATENCY,
18516 + PREEMPTIRQSOFF_LATENCY,
18518 + WAKEUP_LATENCY_SHAREDPRIO,
18519 + MISSED_TIMER_OFFSETS,
18520 + TIMERANDWAKEUP_LATENCY,
18521 + MAX_LATENCY_TYPE,
18524 +#define MAX_ENTRY_NUM 10240
18526 +struct hist_data {
18527 + atomic_t hist_mode; /* 0 log, 1 don't log */
18528 + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
18531 + unsigned long long below_hist_bound_samples;
18532 + unsigned long long above_hist_bound_samples;
18533 + long long accumulate_lat;
18534 + unsigned long long total_samples;
18535 + unsigned long long hist_array[MAX_ENTRY_NUM];
18538 +struct enable_data {
18539 + int latency_type;
18543 +static char *latency_hist_dir_root = "latency_hist";
18545 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18546 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
18547 +static char *irqsoff_hist_dir = "irqsoff";
18548 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
18549 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
18552 +#ifdef CONFIG_PREEMPT_OFF_HIST
18553 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
18554 +static char *preemptoff_hist_dir = "preemptoff";
18555 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
18556 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
18559 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
18560 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
18561 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
18562 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
18563 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
18566 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
18567 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
18568 +static struct enable_data preemptirqsoff_enabled_data = {
18569 + .latency_type = PREEMPTIRQSOFF_LATENCY,
18574 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18575 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18576 +struct maxlatproc_data {
18577 + char comm[FIELD_SIZEOF(struct task_struct, comm)];
18578 + char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
18582 + int current_prio;
18584 + long timeroffset;
18585 + cycle_t timestamp;
18589 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18590 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
18591 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
18592 +static char *wakeup_latency_hist_dir = "wakeup";
18593 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
18594 +static notrace void probe_wakeup_latency_hist_start(void *v,
18595 + struct task_struct *p);
18596 +static notrace void probe_wakeup_latency_hist_stop(void *v,
18597 + bool preempt, struct task_struct *prev, struct task_struct *next);
18598 +static notrace void probe_sched_migrate_task(void *,
18599 + struct task_struct *task, int cpu);
18600 +static struct enable_data wakeup_latency_enabled_data = {
18601 + .latency_type = WAKEUP_LATENCY,
18604 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
18605 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
18606 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
18607 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
18608 +static unsigned long wakeup_pid;
18611 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18612 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
18613 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
18614 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
18615 + long long offset, struct task_struct *curr, struct task_struct *task);
18616 +static struct enable_data missed_timer_offsets_enabled_data = {
18617 + .latency_type = MISSED_TIMER_OFFSETS,
18620 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
18621 +static unsigned long missed_timer_offsets_pid;
18624 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18625 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18626 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
18627 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
18628 +static struct enable_data timerandwakeup_enabled_data = {
18629 + .latency_type = TIMERANDWAKEUP_LATENCY,
18632 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
18635 +void notrace latency_hist(int latency_type, int cpu, long latency,
18636 + long timeroffset, cycle_t stop,
18637 + struct task_struct *p)
18639 + struct hist_data *my_hist;
18640 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18641 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18642 + struct maxlatproc_data *mp = NULL;
18645 + if (!cpu_possible(cpu) || latency_type < 0 ||
18646 + latency_type >= MAX_LATENCY_TYPE)
18649 + switch (latency_type) {
18650 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18651 + case IRQSOFF_LATENCY:
18652 + my_hist = &per_cpu(irqsoff_hist, cpu);
18655 +#ifdef CONFIG_PREEMPT_OFF_HIST
18656 + case PREEMPTOFF_LATENCY:
18657 + my_hist = &per_cpu(preemptoff_hist, cpu);
18660 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
18661 + case PREEMPTIRQSOFF_LATENCY:
18662 + my_hist = &per_cpu(preemptirqsoff_hist, cpu);
18665 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18666 + case WAKEUP_LATENCY:
18667 + my_hist = &per_cpu(wakeup_latency_hist, cpu);
18668 + mp = &per_cpu(wakeup_maxlatproc, cpu);
18670 + case WAKEUP_LATENCY_SHAREDPRIO:
18671 + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
18672 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
18675 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18676 + case MISSED_TIMER_OFFSETS:
18677 + my_hist = &per_cpu(missed_timer_offsets, cpu);
18678 + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
18681 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18682 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18683 + case TIMERANDWAKEUP_LATENCY:
18684 + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
18685 + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
18693 + latency += my_hist->offset;
18695 + if (atomic_read(&my_hist->hist_mode) == 0)
18698 + if (latency < 0 || latency >= MAX_ENTRY_NUM) {
18700 + my_hist->below_hist_bound_samples++;
18702 + my_hist->above_hist_bound_samples++;
18704 + my_hist->hist_array[latency]++;
18706 + if (unlikely(latency > my_hist->max_lat ||
18707 + my_hist->min_lat == LONG_MAX)) {
18708 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18709 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18710 + if (latency_type == WAKEUP_LATENCY ||
18711 + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
18712 + latency_type == MISSED_TIMER_OFFSETS ||
18713 + latency_type == TIMERANDWAKEUP_LATENCY) {
18714 + strncpy(mp->comm, p->comm, sizeof(mp->comm));
18715 + strncpy(mp->current_comm, current->comm,
18716 + sizeof(mp->current_comm));
18717 + mp->pid = task_pid_nr(p);
18718 + mp->current_pid = task_pid_nr(current);
18719 + mp->prio = p->prio;
18720 + mp->current_prio = current->prio;
18721 + mp->latency = latency;
18722 + mp->timeroffset = timeroffset;
18723 + mp->timestamp = stop;
18726 + my_hist->max_lat = latency;
18728 + if (unlikely(latency < my_hist->min_lat))
18729 + my_hist->min_lat = latency;
18730 + my_hist->total_samples++;
18731 + my_hist->accumulate_lat += latency;
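
The accounting above is deliberately simple: one histogram slot per
microsecond at index latency + offset (the struct comment earlier suggests
offset = MAX_ENTRY_NUM/2 when a bipolar scale is wanted), and out-of-range
samples are only counted in the below/above counters, never stored. The same
arithmetic as a standalone demo:

#include <stdio.h>

#define MAX_ENTRY_NUM 10240

struct hist {
	long long slot[MAX_ENTRY_NUM];
	long long below, above;	/* below/above_hist_bound_samples */
	long offset;		/* 0, or MAX_ENTRY_NUM/2 for bipolar */
};

static void account(struct hist *h, long latency_us)
{
	long idx = latency_us + h->offset;

	if (idx < 0)
		h->below++;
	else if (idx >= MAX_ENTRY_NUM)
		h->above++;
	else
		h->slot[idx]++;
}

int main(void)
{
	static struct hist bipolar = { .offset = MAX_ENTRY_NUM / 2 };

	account(&bipolar, -3);	/* e.g. a timer that fired 3 us early */
	account(&bipolar, 42);	/* e.g. a timer that fired 42 us late  */
	printf("slot[%d]=%lld slot[%d]=%lld\n",
	       MAX_ENTRY_NUM / 2 - 3, bipolar.slot[MAX_ENTRY_NUM / 2 - 3],
	       MAX_ENTRY_NUM / 2 + 42, bipolar.slot[MAX_ENTRY_NUM / 2 + 42]);
	return 0;
}
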
18734 +static void *l_start(struct seq_file *m, loff_t *pos)
18736 + loff_t *index_ptr = NULL;
18737 + loff_t index = *pos;
18738 + struct hist_data *my_hist = m->private;
18740 + if (index == 0) {
18741 + char minstr[32], avgstr[32], maxstr[32];
18743 + atomic_dec(&my_hist->hist_mode);
18745 + if (likely(my_hist->total_samples)) {
18746 + long avg = (long) div64_s64(my_hist->accumulate_lat,
18747 + my_hist->total_samples);
18748 + snprintf(minstr, sizeof(minstr), "%ld",
18749 + my_hist->min_lat - my_hist->offset);
18750 + snprintf(avgstr, sizeof(avgstr), "%ld",
18751 + avg - my_hist->offset);
18752 + snprintf(maxstr, sizeof(maxstr), "%ld",
18753 + my_hist->max_lat - my_hist->offset);
18755 + strcpy(minstr, "<undef>");
18756 + strcpy(avgstr, minstr);
18757 + strcpy(maxstr, minstr);
18760 + seq_printf(m, "#Minimum latency: %s microseconds\n"
18761 + "#Average latency: %s microseconds\n"
18762 + "#Maximum latency: %s microseconds\n"
18763 + "#Total samples: %llu\n"
18764 + "#There are %llu samples lower than %ld"
18765 + " microseconds.\n"
18766 + "#There are %llu samples greater or equal"
18767 + " than %ld microseconds.\n"
18768 + "#usecs\t%16s\n",
18769 + minstr, avgstr, maxstr,
18770 + my_hist->total_samples,
18771 + my_hist->below_hist_bound_samples,
18772 + -my_hist->offset,
18773 + my_hist->above_hist_bound_samples,
18774 + MAX_ENTRY_NUM - my_hist->offset,
18777 + if (index < MAX_ENTRY_NUM) {
18778 + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
18780 + *index_ptr = index;
18783 + return index_ptr;
18786 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
18788 + loff_t *index_ptr = p;
18789 + struct hist_data *my_hist = m->private;
18791 + if (++*pos >= MAX_ENTRY_NUM) {
18792 + atomic_inc(&my_hist->hist_mode);
18795 + *index_ptr = *pos;
18796 + return index_ptr;
18799 +static void l_stop(struct seq_file *m, void *p)
18804 +static int l_show(struct seq_file *m, void *p)
18806 + int index = *(loff_t *) p;
18807 + struct hist_data *my_hist = m->private;
18809 + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
18810 + my_hist->hist_array[index]);
18814 +static const struct seq_operations latency_hist_seq_op = {
18815 + .start = l_start,
18821 +static int latency_hist_open(struct inode *inode, struct file *file)
18825 + ret = seq_open(file, &latency_hist_seq_op);
18827 + struct seq_file *seq = file->private_data;
18828 + seq->private = inode->i_private;
18833 +static const struct file_operations latency_hist_fops = {
18834 + .open = latency_hist_open,
18835 + .read = seq_read,
18836 + .llseek = seq_lseek,
18837 + .release = seq_release,
18840 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18841 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18842 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
18844 + mp->comm[0] = mp->current_comm[0] = '\0';
18845 + mp->prio = mp->current_prio = mp->pid = mp->current_pid =
18846 + mp->latency = mp->timeroffset = -1;
18847 + mp->timestamp = 0;
18851 +static void hist_reset(struct hist_data *hist)
18853 + atomic_dec(&hist->hist_mode);
18855 + memset(hist->hist_array, 0, sizeof(hist->hist_array));
18856 + hist->below_hist_bound_samples = 0ULL;
18857 + hist->above_hist_bound_samples = 0ULL;
18858 + hist->min_lat = LONG_MAX;
18859 + hist->max_lat = LONG_MIN;
18860 + hist->total_samples = 0ULL;
18861 + hist->accumulate_lat = 0LL;
18863 + atomic_inc(&hist->hist_mode);
18867 +latency_hist_reset(struct file *file, const char __user *a,
18868 + size_t size, loff_t *off)
18871 + struct hist_data *hist = NULL;
18872 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18873 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18874 + struct maxlatproc_data *mp = NULL;
18876 + off_t latency_type = (off_t) file->private_data;
18878 + for_each_online_cpu(cpu) {
18880 + switch (latency_type) {
18881 +#ifdef CONFIG_PREEMPT_OFF_HIST
18882 + case PREEMPTOFF_LATENCY:
18883 + hist = &per_cpu(preemptoff_hist, cpu);
18886 +#ifdef CONFIG_INTERRUPT_OFF_HIST
18887 + case IRQSOFF_LATENCY:
18888 + hist = &per_cpu(irqsoff_hist, cpu);
18891 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
18892 + case PREEMPTIRQSOFF_LATENCY:
18893 + hist = &per_cpu(preemptirqsoff_hist, cpu);
18896 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
18897 + case WAKEUP_LATENCY:
18898 + hist = &per_cpu(wakeup_latency_hist, cpu);
18899 + mp = &per_cpu(wakeup_maxlatproc, cpu);
18901 + case WAKEUP_LATENCY_SHAREDPRIO:
18902 + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
18903 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
18906 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
18907 + case MISSED_TIMER_OFFSETS:
18908 + hist = &per_cpu(missed_timer_offsets, cpu);
18909 + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
18912 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
18913 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18914 + case TIMERANDWAKEUP_LATENCY:
18915 + hist = &per_cpu(timerandwakeup_latency_hist, cpu);
18916 + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
18921 + hist_reset(hist);
18922 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18923 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18924 + if (latency_type == WAKEUP_LATENCY ||
18925 + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
18926 + latency_type == MISSED_TIMER_OFFSETS ||
18927 + latency_type == TIMERANDWAKEUP_LATENCY)
18928 + clear_maxlatprocdata(mp);
18935 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18936 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18938 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
18942 + unsigned long *this_pid = file->private_data;
18944 + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
18945 + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
18948 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
18949 + size_t cnt, loff_t *ppos)
18952 + unsigned long pid;
18953 + unsigned long *this_pid = file->private_data;
18955 + if (cnt >= sizeof(buf))
18958 + if (copy_from_user(&buf, ubuf, cnt))
18963 + if (kstrtoul(buf, 10, &pid))
18972 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
18973 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
18975 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
18978 + struct maxlatproc_data *mp = file->private_data;
18979 + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
18980 + unsigned long long t;
18981 + unsigned long usecs, secs;
18984 + if (mp->pid == -1 || mp->current_pid == -1) {
18985 + buf = "(none)\n";
18986 + return simple_read_from_buffer(ubuf, cnt, ppos, buf,
18990 + buf = kmalloc(strmaxlen, GFP_KERNEL);
18994 + t = ns2usecs(mp->timestamp);
18995 + usecs = do_div(t, USEC_PER_SEC);
18996 + secs = (unsigned long) t;
18997 + r = snprintf(buf, strmaxlen,
18998 + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
18999 + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
19000 + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
19002 + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19009 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19012 + struct enable_data *ed = file->private_data;
19015 + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
19016 + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19020 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
19024 + struct enable_data *ed = file->private_data;
19026 + if (cnt >= sizeof(buf))
19029 + if (copy_from_user(&buf, ubuf, cnt))
19034 + if (kstrtoul(buf, 10, &enable))
19037 + if ((enable && ed->enabled) || (!enable && !ed->enabled))
19043 + switch (ed->latency_type) {
19044 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19045 + case PREEMPTIRQSOFF_LATENCY:
19046 + ret = register_trace_preemptirqsoff_hist(
19047 + probe_preemptirqsoff_hist, NULL);
19049 + pr_info("wakeup trace: Couldn't assign "
19050 + "probe_preemptirqsoff_hist "
19051 + "to trace_preemptirqsoff_hist\n");
19056 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19057 + case WAKEUP_LATENCY:
19058 + ret = register_trace_sched_wakeup(
19059 + probe_wakeup_latency_hist_start, NULL);
19061 + pr_info("wakeup trace: Couldn't assign "
19062 + "probe_wakeup_latency_hist_start "
19063 + "to trace_sched_wakeup\n");
19066 + ret = register_trace_sched_wakeup_new(
19067 + probe_wakeup_latency_hist_start, NULL);
19069 + pr_info("wakeup trace: Couldn't assign "
19070 + "probe_wakeup_latency_hist_start "
19071 + "to trace_sched_wakeup_new\n");
19072 + unregister_trace_sched_wakeup(
19073 + probe_wakeup_latency_hist_start, NULL);
19076 + ret = register_trace_sched_switch(
19077 + probe_wakeup_latency_hist_stop, NULL);
19079 + pr_info("wakeup trace: Couldn't assign "
19080 + "probe_wakeup_latency_hist_stop "
19081 + "to trace_sched_switch\n");
19082 + unregister_trace_sched_wakeup(
19083 + probe_wakeup_latency_hist_start, NULL);
19084 + unregister_trace_sched_wakeup_new(
19085 + probe_wakeup_latency_hist_start, NULL);
19088 + ret = register_trace_sched_migrate_task(
19089 + probe_sched_migrate_task, NULL);
19091 + pr_info("wakeup trace: Couldn't assign "
19092 + "probe_sched_migrate_task "
19093 + "to trace_sched_migrate_task\n");
19094 + unregister_trace_sched_wakeup(
19095 + probe_wakeup_latency_hist_start, NULL);
19096 + unregister_trace_sched_wakeup_new(
19097 + probe_wakeup_latency_hist_start, NULL);
19098 + unregister_trace_sched_switch(
19099 + probe_wakeup_latency_hist_stop, NULL);
19104 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19105 + case MISSED_TIMER_OFFSETS:
19106 + ret = register_trace_hrtimer_interrupt(
19107 + probe_hrtimer_interrupt, NULL);
19109 + pr_info("wakeup trace: Couldn't assign "
19110 + "probe_hrtimer_interrupt "
19111 + "to trace_hrtimer_interrupt\n");
19116 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19117 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19118 + case TIMERANDWAKEUP_LATENCY:
19119 + if (!wakeup_latency_enabled_data.enabled ||
19120 + !missed_timer_offsets_enabled_data.enabled)
19128 + switch (ed->latency_type) {
19129 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19130 + case PREEMPTIRQSOFF_LATENCY:
19134 + unregister_trace_preemptirqsoff_hist(
19135 + probe_preemptirqsoff_hist, NULL);
19136 + for_each_online_cpu(cpu) {
19137 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19138 + per_cpu(hist_irqsoff_counting,
19141 +#ifdef CONFIG_PREEMPT_OFF_HIST
19142 + per_cpu(hist_preemptoff_counting,
19145 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19146 + per_cpu(hist_preemptirqsoff_counting,
19153 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19154 + case WAKEUP_LATENCY:
19158 + unregister_trace_sched_wakeup(
19159 + probe_wakeup_latency_hist_start, NULL);
19160 + unregister_trace_sched_wakeup_new(
19161 + probe_wakeup_latency_hist_start, NULL);
19162 + unregister_trace_sched_switch(
19163 + probe_wakeup_latency_hist_stop, NULL);
19164 + unregister_trace_sched_migrate_task(
19165 + probe_sched_migrate_task, NULL);
19167 + for_each_online_cpu(cpu) {
19168 + per_cpu(wakeup_task, cpu) = NULL;
19169 + per_cpu(wakeup_sharedprio, cpu) = 0;
19172 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19173 + timerandwakeup_enabled_data.enabled = 0;
19177 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19178 + case MISSED_TIMER_OFFSETS:
19179 + unregister_trace_hrtimer_interrupt(
19180 + probe_hrtimer_interrupt, NULL);
19181 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19182 + timerandwakeup_enabled_data.enabled = 0;
19190 + ed->enabled = enable;
19194 +static const struct file_operations latency_hist_reset_fops = {
19195 + .open = tracing_open_generic,
19196 + .write = latency_hist_reset,
19199 +static const struct file_operations enable_fops = {
19200 + .open = tracing_open_generic,
19201 + .read = show_enable,
19202 + .write = do_enable,
19205 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19206 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19207 +static const struct file_operations pid_fops = {
19208 + .open = tracing_open_generic,
19209 + .read = show_pid,
19213 +static const struct file_operations maxlatproc_fops = {
19214 + .open = tracing_open_generic,
19215 + .read = show_maxlatproc,
19219 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19220 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
19223 + int cpu = raw_smp_processor_id();
19224 + int time_set = 0;
19227 + cycle_t uninitialized_var(start);
19229 + if (!preempt_count() && !irqs_disabled())
19232 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19233 + if ((reason == IRQS_OFF || reason == TRACE_START) &&
19234 + !per_cpu(hist_irqsoff_counting, cpu)) {
19235 + per_cpu(hist_irqsoff_counting, cpu) = 1;
19236 + start = ftrace_now(cpu);
19238 + per_cpu(hist_irqsoff_start, cpu) = start;
19242 +#ifdef CONFIG_PREEMPT_OFF_HIST
19243 + if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
19244 + !per_cpu(hist_preemptoff_counting, cpu)) {
19245 + per_cpu(hist_preemptoff_counting, cpu) = 1;
19246 + if (!(time_set++))
19247 + start = ftrace_now(cpu);
19248 + per_cpu(hist_preemptoff_start, cpu) = start;
19252 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19253 + if (per_cpu(hist_irqsoff_counting, cpu) &&
19254 + per_cpu(hist_preemptoff_counting, cpu) &&
19255 + !per_cpu(hist_preemptirqsoff_counting, cpu)) {
19256 + per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
19258 + start = ftrace_now(cpu);
19259 + per_cpu(hist_preemptirqsoff_start, cpu) = start;
19263 + cycle_t uninitialized_var(stop);
19265 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19266 + if ((reason == IRQS_ON || reason == TRACE_STOP) &&
19267 + per_cpu(hist_irqsoff_counting, cpu)) {
19268 + cycle_t start = per_cpu(hist_irqsoff_start, cpu);
19270 + stop = ftrace_now(cpu);
19273 + long latency = ((long) (stop - start)) /
19276 + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
19279 + per_cpu(hist_irqsoff_counting, cpu) = 0;
19283 +#ifdef CONFIG_PREEMPT_OFF_HIST
19284 + if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
19285 + per_cpu(hist_preemptoff_counting, cpu)) {
19286 + cycle_t start = per_cpu(hist_preemptoff_start, cpu);
19288 + if (!(time_set++))
19289 + stop = ftrace_now(cpu);
19291 + long latency = ((long) (stop - start)) /
19294 + latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
19297 + per_cpu(hist_preemptoff_counting, cpu) = 0;
19301 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19302 + if ((!per_cpu(hist_irqsoff_counting, cpu) ||
19303 + !per_cpu(hist_preemptoff_counting, cpu)) &&
19304 + per_cpu(hist_preemptirqsoff_counting, cpu)) {
19305 + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
19308 + stop = ftrace_now(cpu);
19310 + long latency = ((long) (stop - start)) /
19313 + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
19314 + latency, 0, stop, NULL);
19316 + per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
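
One subtlety in the probe above: a single event can start or stop up to three
histograms (irqs-off, preempt-off and the combined preemptirqsoff one), and
the time_set counter ensures ftrace_now() is read at most once per event, so
all affected histograms share an identical timestamp. The idiom in isolation
(userspace sketch):

#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

void on_event(int want_a, int want_b, uint64_t *stamp_a, uint64_t *stamp_b)
{
	int time_set = 0;
	uint64_t now = 0;

	if (want_a) {
		if (!(time_set++))
			now = now_ns();	/* first consumer reads the clock */
		*stamp_a = now;
	}
	if (want_b) {
		if (!(time_set++))
			now = now_ns();	/* skipped when already taken above */
		*stamp_b = now;
	}
}
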
19323 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19324 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
19325 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
19328 + int old_cpu = task_cpu(task);
19330 + if (cpu != old_cpu) {
19331 + unsigned long flags;
19332 + struct task_struct *cpu_wakeup_task;
19334 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19336 + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
19337 + if (task == cpu_wakeup_task) {
19338 + put_task_struct(cpu_wakeup_task);
19339 + per_cpu(wakeup_task, old_cpu) = NULL;
19340 + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
19341 + get_task_struct(cpu_wakeup_task);
19344 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19348 +static notrace void probe_wakeup_latency_hist_start(void *v,
19349 + struct task_struct *p)
19351 + unsigned long flags;
19352 + struct task_struct *curr = current;
19353 + int cpu = task_cpu(p);
19354 + struct task_struct *cpu_wakeup_task;
19356 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19358 + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19360 + if (wakeup_pid) {
19361 + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19362 + p->prio == curr->prio)
19363 + per_cpu(wakeup_sharedprio, cpu) = 1;
19364 + if (likely(wakeup_pid != task_pid_nr(p)))
19367 + if (likely(!rt_task(p)) ||
19368 + (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
19369 + p->prio > curr->prio)
19371 + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
19372 + p->prio == curr->prio)
19373 + per_cpu(wakeup_sharedprio, cpu) = 1;
19376 + if (cpu_wakeup_task)
19377 + put_task_struct(cpu_wakeup_task);
19378 + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
19379 + get_task_struct(cpu_wakeup_task);
19380 + cpu_wakeup_task->preempt_timestamp_hist =
19381 + ftrace_now(raw_smp_processor_id());
19383 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19386 +static notrace void probe_wakeup_latency_hist_stop(void *v,
19387 + bool preempt, struct task_struct *prev, struct task_struct *next)
19389 + unsigned long flags;
19390 + int cpu = task_cpu(next);
19393 + struct task_struct *cpu_wakeup_task;
19395 + raw_spin_lock_irqsave(&wakeup_lock, flags);
19397 + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
19399 + if (cpu_wakeup_task == NULL)
19402 + /* Already running? */
19403 + if (unlikely(current == cpu_wakeup_task))
19406 + if (next != cpu_wakeup_task) {
19407 + if (next->prio < cpu_wakeup_task->prio)
19410 + if (next->prio == cpu_wakeup_task->prio)
19411 + per_cpu(wakeup_sharedprio, cpu) = 1;
19416 + if (current->prio == cpu_wakeup_task->prio)
19417 + per_cpu(wakeup_sharedprio, cpu) = 1;
19420 + * The task we are waiting for is about to be switched to.
19421 + * Calculate latency and store it in histogram.
19423 + stop = ftrace_now(raw_smp_processor_id());
19425 + latency = ((long) (stop - next->preempt_timestamp_hist)) /
19428 + if (per_cpu(wakeup_sharedprio, cpu)) {
19429 + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
19431 + per_cpu(wakeup_sharedprio, cpu) = 0;
19433 + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
19434 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19435 + if (timerandwakeup_enabled_data.enabled) {
19436 + latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
19437 + next->timer_offset + latency, next->timer_offset,
19444 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19445 + next->timer_offset = 0;
19447 + put_task_struct(cpu_wakeup_task);
19448 + per_cpu(wakeup_task, cpu) = NULL;
19450 + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
19454 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19455 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
19456 + long long latency_ns, struct task_struct *curr,
19457 + struct task_struct *task)
19459 + if (latency_ns <= 0 && task != NULL && rt_task(task) &&
19460 + (task->prio < curr->prio ||
19461 + (task->prio == curr->prio &&
19462 + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
19466 + if (missed_timer_offsets_pid) {
19467 + if (likely(missed_timer_offsets_pid !=
19468 + task_pid_nr(task)))
19472 + now = ftrace_now(cpu);
19473 + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
19474 + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
19476 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19477 + task->timer_offset = latency;
19483 +static __init int latency_hist_init(void)
19485 + struct dentry *latency_hist_root = NULL;
19486 + struct dentry *dentry;
19487 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19488 + struct dentry *dentry_sharedprio;
19490 + struct dentry *entry;
19491 + struct dentry *enable_root;
19493 + struct hist_data *my_hist;
19495 + char *cpufmt = "CPU%d";
19496 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19497 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19498 + char *cpufmt_maxlatproc = "max_latency-CPU%d";
19499 + struct maxlatproc_data *mp = NULL;
19502 + dentry = tracing_init_dentry();
19503 + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
19504 + enable_root = debugfs_create_dir("enable", latency_hist_root);
19506 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19507 + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
19508 + for_each_possible_cpu(i) {
19509 + sprintf(name, cpufmt, i);
19510 + entry = debugfs_create_file(name, 0444, dentry,
19511 + &per_cpu(irqsoff_hist, i), &latency_hist_fops);
19512 + my_hist = &per_cpu(irqsoff_hist, i);
19513 + atomic_set(&my_hist->hist_mode, 1);
19514 + my_hist->min_lat = LONG_MAX;
19516 + entry = debugfs_create_file("reset", 0644, dentry,
19517 + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
19520 +#ifdef CONFIG_PREEMPT_OFF_HIST
19521 + dentry = debugfs_create_dir(preemptoff_hist_dir,
19522 + latency_hist_root);
19523 + for_each_possible_cpu(i) {
19524 + sprintf(name, cpufmt, i);
19525 + entry = debugfs_create_file(name, 0444, dentry,
19526 + &per_cpu(preemptoff_hist, i), &latency_hist_fops);
19527 + my_hist = &per_cpu(preemptoff_hist, i);
19528 + atomic_set(&my_hist->hist_mode, 1);
19529 + my_hist->min_lat = LONG_MAX;
19531 + entry = debugfs_create_file("reset", 0644, dentry,
19532 + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
19535 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19536 + dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
19537 + latency_hist_root);
19538 + for_each_possible_cpu(i) {
19539 + sprintf(name, cpufmt, i);
19540 + entry = debugfs_create_file(name, 0444, dentry,
19541 + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
19542 + my_hist = &per_cpu(preemptirqsoff_hist, i);
19543 + atomic_set(&my_hist->hist_mode, 1);
19544 + my_hist->min_lat = LONG_MAX;
19546 + entry = debugfs_create_file("reset", 0644, dentry,
19547 + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
19550 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19551 + entry = debugfs_create_file("preemptirqsoff", 0644,
19552 + enable_root, (void *)&preemptirqsoff_enabled_data,
19556 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19557 + dentry = debugfs_create_dir(wakeup_latency_hist_dir,
19558 + latency_hist_root);
19559 + dentry_sharedprio = debugfs_create_dir(
19560 + wakeup_latency_hist_dir_sharedprio, dentry);
19561 + for_each_possible_cpu(i) {
19562 + sprintf(name, cpufmt, i);
19564 + entry = debugfs_create_file(name, 0444, dentry,
19565 + &per_cpu(wakeup_latency_hist, i),
19566 + &latency_hist_fops);
19567 + my_hist = &per_cpu(wakeup_latency_hist, i);
19568 + atomic_set(&my_hist->hist_mode, 1);
19569 + my_hist->min_lat = LONG_MAX;
19571 + entry = debugfs_create_file(name, 0444, dentry_sharedprio,
19572 + &per_cpu(wakeup_latency_hist_sharedprio, i),
19573 + &latency_hist_fops);
19574 + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
19575 + atomic_set(&my_hist->hist_mode, 1);
19576 + my_hist->min_lat = LONG_MAX;
19578 + sprintf(name, cpufmt_maxlatproc, i);
19580 + mp = &per_cpu(wakeup_maxlatproc, i);
19581 + entry = debugfs_create_file(name, 0444, dentry, mp,
19582 + &maxlatproc_fops);
19583 + clear_maxlatprocdata(mp);
19585 + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
19586 + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
19587 + &maxlatproc_fops);
19588 + clear_maxlatprocdata(mp);
19590 + entry = debugfs_create_file("pid", 0644, dentry,
19591 + (void *)&wakeup_pid, &pid_fops);
19592 + entry = debugfs_create_file("reset", 0644, dentry,
19593 + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
19594 + entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
19595 + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
19596 + entry = debugfs_create_file("wakeup", 0644,
19597 + enable_root, (void *)&wakeup_latency_enabled_data,
19601 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19602 + dentry = debugfs_create_dir(missed_timer_offsets_dir,
19603 + latency_hist_root);
19604 + for_each_possible_cpu(i) {
19605 + sprintf(name, cpufmt, i);
19606 + entry = debugfs_create_file(name, 0444, dentry,
19607 + &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
19608 + my_hist = &per_cpu(missed_timer_offsets, i);
19609 + atomic_set(&my_hist->hist_mode, 1);
19610 + my_hist->min_lat = LONG_MAX;
19612 + sprintf(name, cpufmt_maxlatproc, i);
19613 + mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
19614 + entry = debugfs_create_file(name, 0444, dentry, mp,
19615 + &maxlatproc_fops);
19616 + clear_maxlatprocdata(mp);
19618 + entry = debugfs_create_file("pid", 0644, dentry,
19619 + (void *)&missed_timer_offsets_pid, &pid_fops);
19620 + entry = debugfs_create_file("reset", 0644, dentry,
19621 + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
19622 + entry = debugfs_create_file("missed_timer_offsets", 0644,
19623 + enable_root, (void *)&missed_timer_offsets_enabled_data,
19627 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19628 + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19629 + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
19630 + latency_hist_root);
19631 + for_each_possible_cpu(i) {
19632 + sprintf(name, cpufmt, i);
19633 + entry = debugfs_create_file(name, 0444, dentry,
19634 + &per_cpu(timerandwakeup_latency_hist, i),
19635 + &latency_hist_fops);
19636 + my_hist = &per_cpu(timerandwakeup_latency_hist, i);
19637 + atomic_set(&my_hist->hist_mode, 1);
19638 + my_hist->min_lat = LONG_MAX;
19640 + sprintf(name, cpufmt_maxlatproc, i);
19641 + mp = &per_cpu(timerandwakeup_maxlatproc, i);
19642 + entry = debugfs_create_file(name, 0444, dentry, mp,
19643 + &maxlatproc_fops);
19644 + clear_maxlatprocdata(mp);
19646 + entry = debugfs_create_file("reset", 0644, dentry,
19647 + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
19648 + entry = debugfs_create_file("timerandwakeup", 0644,
19649 + enable_root, (void *)&timerandwakeup_enabled_data,
19655 +device_initcall(latency_hist_init);
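
Besides the per-CPU histogram files, latency_hist_init() creates a writable
"reset" file in each directory (any write clears the per-CPU data through
hist_reset()) and, for the wakeup and missed-timer histograms, a "pid" file
that restricts accounting to a single task via do_pid(). Typical use from
userspace (sketch, same debugfs root as before):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define WAKEUP "/sys/kernel/debug/tracing/latency_hist/wakeup/"

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, s, strlen(s)) < 0)
		perror(path);
	close(fd);
	return 0;
}

int main(void)
{
	char pid[16];

	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	write_str(WAKEUP "reset", "1");	/* zero all per-CPU histograms */
	write_str(WAKEUP "pid", pid);	/* account only this task's wakeups */
	return 0;
}
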
19656 diff -Nur linux-4.9.6.orig/kernel/trace/Makefile linux-4.9.6/kernel/trace/Makefile
19657 --- linux-4.9.6.orig/kernel/trace/Makefile 2017-01-26 08:25:24.000000000 +0100
19658 +++ linux-4.9.6/kernel/trace/Makefile 2017-01-28 13:59:10.091662336 +0100
19660 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
19661 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
19662 obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
19663 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
19664 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
19665 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
19666 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
19667 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
19668 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
19669 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
19670 diff -Nur linux-4.9.6.orig/kernel/trace/trace.c linux-4.9.6/kernel/trace/trace.c
19671 --- linux-4.9.6.orig/kernel/trace/trace.c 2017-01-26 08:25:24.000000000 +0100
19672 +++ linux-4.9.6/kernel/trace/trace.c 2017-01-28 13:59:10.091662336 +0100
19673 @@ -1897,6 +1897,7 @@
19674 struct task_struct *tsk = current;
19676 entry->preempt_count = pc & 0xff;
19677 + entry->preempt_lazy_count = preempt_lazy_count();
19678 entry->pid = (tsk) ? tsk->pid : 0;
19680 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
19681 @@ -1907,8 +1908,11 @@
19682 ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
19683 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
19684 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
19685 - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
19686 + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
19687 + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
19688 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
19690 + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
19692 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
19694 @@ -2892,14 +2896,17 @@
19696 static void print_lat_help_header(struct seq_file *m)
19698 - seq_puts(m, "# _------=> CPU# \n"
19699 - "# / _-----=> irqs-off \n"
19700 - "# | / _----=> need-resched \n"
19701 - "# || / _---=> hardirq/softirq \n"
19702 - "# ||| / _--=> preempt-depth \n"
19703 - "# |||| / delay \n"
19704 - "# cmd pid ||||| time | caller \n"
19705 - "# \\ / ||||| \\ | / \n");
19706 + seq_puts(m, "# _--------=> CPU# \n"
19707 + "# / _-------=> irqs-off \n"
19708 + "# | / _------=> need-resched \n"
19709 + "# || / _-----=> need-resched_lazy \n"
19710 + "# ||| / _----=> hardirq/softirq \n"
19711 + "# |||| / _---=> preempt-depth \n"
19712 + "# ||||| / _--=> preempt-lazy-depth\n"
19713 + "# |||||| / _-=> migrate-disable \n"
19714 + "# ||||||| / delay \n"
19715 + "# cmd pid |||||||| time | caller \n"
19716 + "# \\ / |||||||| \\ | / \n");
19719 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
19720 @@ -2925,11 +2932,14 @@
19721 print_event_info(buf, m);
19722 seq_puts(m, "# _-----=> irqs-off\n"
19723 "# / _----=> need-resched\n"
19724 - "# | / _---=> hardirq/softirq\n"
19725 - "# || / _--=> preempt-depth\n"
19726 - "# ||| / delay\n"
19727 - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
19728 - "# | | | |||| | |\n");
19729 + "# |/ _-----=> need-resched_lazy\n"
19730 + "# || / _---=> hardirq/softirq\n"
19731 + "# ||| / _--=> preempt-depth\n"
19732 + "# |||| / _-=> preempt-lazy-depth\n"
19733 + "# ||||| / _-=> migrate-disable \n"
19734 + "# |||||| / delay\n"
19735 + "# TASK-PID CPU# ||||||| TIMESTAMP FUNCTION\n"
19736 + "# | | | ||||||| | |\n");
19740 diff -Nur linux-4.9.6.orig/kernel/trace/trace_events.c linux-4.9.6/kernel/trace/trace_events.c
19741 --- linux-4.9.6.orig/kernel/trace/trace_events.c 2017-01-26 08:25:24.000000000 +0100
19742 +++ linux-4.9.6/kernel/trace/trace_events.c 2017-01-28 13:59:10.091662336 +0100
19743 @@ -187,6 +187,8 @@
19744 __common_field(unsigned char, flags);
19745 __common_field(unsigned char, preempt_count);
19746 __common_field(int, pid);
19747 + __common_field(unsigned short, migrate_disable);
19748 + __common_field(unsigned short, padding);
19752 diff -Nur linux-4.9.6.orig/kernel/trace/trace.h linux-4.9.6/kernel/trace/trace.h
19753 --- linux-4.9.6.orig/kernel/trace/trace.h 2017-01-26 08:25:24.000000000 +0100
19754 +++ linux-4.9.6/kernel/trace/trace.h 2017-01-28 13:59:10.091662336 +0100
19755 @@ -124,6 +124,7 @@
19756 * NEED_RESCHED - reschedule is requested
19757 * HARDIRQ - inside an interrupt handler
19758 * SOFTIRQ - inside a softirq handler
19759 + * NEED_RESCHED_LAZY - lazy reschedule is requested
19761 enum trace_flag_type {
19762 TRACE_FLAG_IRQS_OFF = 0x01,
19763 @@ -133,6 +134,7 @@
19764 TRACE_FLAG_SOFTIRQ = 0x10,
19765 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
19766 TRACE_FLAG_NMI = 0x40,
19767 + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80,
19770 #define TRACE_BUF_SIZE 1024
19771 diff -Nur linux-4.9.6.orig/kernel/trace/trace_irqsoff.c linux-4.9.6/kernel/trace/trace_irqsoff.c
19772 --- linux-4.9.6.orig/kernel/trace/trace_irqsoff.c 2017-01-26 08:25:24.000000000 +0100
19773 +++ linux-4.9.6/kernel/trace/trace_irqsoff.c 2017-01-28 13:59:10.091662336 +0100
19775 #include <linux/uaccess.h>
19776 #include <linux/module.h>
19777 #include <linux/ftrace.h>
19778 +#include <trace/events/hist.h>
19782 @@ -424,11 +425,13 @@
19784 if (preempt_trace() || irq_trace())
19785 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19786 + trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
19788 EXPORT_SYMBOL_GPL(start_critical_timings);
19790 void stop_critical_timings(void)
19792 + trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
19793 if (preempt_trace() || irq_trace())
19794 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19796 @@ -438,6 +441,7 @@
19797 #ifdef CONFIG_PROVE_LOCKING
19798 void time_hardirqs_on(unsigned long a0, unsigned long a1)
19800 + trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
19801 if (!preempt_trace() && irq_trace())
19802 stop_critical_timing(a0, a1);
19804 @@ -446,6 +450,7 @@
19806 if (!preempt_trace() && irq_trace())
19807 start_critical_timing(a0, a1);
19808 + trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
19811 #else /* !CONFIG_PROVE_LOCKING */
19812 @@ -471,6 +476,7 @@
19814 void trace_hardirqs_on(void)
19816 + trace_preemptirqsoff_hist(IRQS_ON, 0);
19817 if (!preempt_trace() && irq_trace())
19818 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19820 @@ -480,11 +486,13 @@
19822 if (!preempt_trace() && irq_trace())
19823 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
19824 + trace_preemptirqsoff_hist(IRQS_OFF, 1);
19826 EXPORT_SYMBOL(trace_hardirqs_off);
19828 __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
19830 + trace_preemptirqsoff_hist(IRQS_ON, 0);
19831 if (!preempt_trace() && irq_trace())
19832 stop_critical_timing(CALLER_ADDR0, caller_addr);
19834 @@ -494,6 +502,7 @@
19836 if (!preempt_trace() && irq_trace())
19837 start_critical_timing(CALLER_ADDR0, caller_addr);
19838 + trace_preemptirqsoff_hist(IRQS_OFF, 1);
19840 EXPORT_SYMBOL(trace_hardirqs_off_caller);
19842 @@ -503,12 +512,14 @@
19843 #ifdef CONFIG_PREEMPT_TRACER
19844 void trace_preempt_on(unsigned long a0, unsigned long a1)
19846 + trace_preemptirqsoff_hist(PREEMPT_ON, 0);
19847 if (preempt_trace() && !irq_trace())
19848 stop_critical_timing(a0, a1);
19851 void trace_preempt_off(unsigned long a0, unsigned long a1)
19853 + trace_preemptirqsoff_hist(PREEMPT_ON, 1);
19854 if (preempt_trace() && !irq_trace())
19855 start_critical_timing(a0, a1);
19857 diff -Nur linux-4.9.6.orig/kernel/trace/trace_output.c linux-4.9.6/kernel/trace/trace_output.c
19858 --- linux-4.9.6.orig/kernel/trace/trace_output.c 2017-01-26 08:25:24.000000000 +0100
19859 +++ linux-4.9.6/kernel/trace/trace_output.c 2017-01-28 13:59:10.091662336 +0100
19860 @@ -386,6 +386,7 @@
19864 + char need_resched_lazy;
19868 @@ -416,6 +417,9 @@
19872 + need_resched_lazy =
19873 + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
19876 (nmi && hardirq) ? 'Z' :
19878 @@ -424,14 +428,25 @@
19882 - trace_seq_printf(s, "%c%c%c",
19883 - irqs_off, need_resched, hardsoft_irq);
19884 + trace_seq_printf(s, "%c%c%c%c",
19885 + irqs_off, need_resched, need_resched_lazy,
19886 + hardsoft_irq);
19888 if (entry->preempt_count)
19889 trace_seq_printf(s, "%x", entry->preempt_count);
19890 else
19891 trace_seq_putc(s, '.');
19893 + if (entry->preempt_lazy_count)
19894 + trace_seq_printf(s, "%x", entry->preempt_lazy_count);
19895 + else
19896 + trace_seq_putc(s, '.');
19898 + if (entry->migrate_disable)
19899 + trace_seq_printf(s, "%x", entry->migrate_disable);
19900 + else
19901 + trace_seq_putc(s, '.');
19903 return !trace_seq_has_overflowed(s);
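Together with the widened header in trace.c above, the latency format now prints four flag letters followed by hex preempt, preempt-lazy and migrate-disable depths, each falling back to '.' when zero. A hedged userspace sketch of that decoding convention (the 0x80 value mirrors TRACE_FLAG_NEED_RESCHED_LAZY from trace.h; the rest is illustrative, not kernel code):

#include <stdio.h>

#define FLAG_NEED_RESCHED_LAZY 0x80	/* TRACE_FLAG_NEED_RESCHED_LAZY */

static void print_flag_columns(unsigned int flags, int preempt,
			       int preempt_lazy, int migrate_disable)
{
	putchar((flags & FLAG_NEED_RESCHED_LAZY) ? 'L' : '.');
	if (preempt)
		printf("%x", preempt);
	else
		putchar('.');
	if (preempt_lazy)
		printf("%x", preempt_lazy);
	else
		putchar('.');
	if (migrate_disable)
		printf("%x", migrate_disable);
	else
		putchar('.');
	putchar('\n');
}

int main(void)
{
	print_flag_columns(0x80, 1, 2, 0);	/* prints "L12." */
	return 0;
}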
19906 diff -Nur linux-4.9.6.orig/kernel/user.c linux-4.9.6/kernel/user.c
19907 --- linux-4.9.6.orig/kernel/user.c 2017-01-26 08:25:24.000000000 +0100
19908 +++ linux-4.9.6/kernel/user.c 2017-01-28 13:59:10.091662336 +0100
19909 @@ -161,11 +161,11 @@
19913 - local_irq_save(flags);
19914 + local_irq_save_nort(flags);
19915 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
19916 free_user(up, flags);
19918 - local_irq_restore(flags);
19919 + local_irq_restore_nort(flags);
19922 struct user_struct *alloc_uid(kuid_t uid)
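free_uid() can run with interrupts already off, and on RT uidhash_lock is a sleeping spinlock that must not be taken inside a hard irq-off region, so the plain irq-save pair becomes the _nort variants, which only disable interrupts on non-RT kernels. A runnable userspace illustration of the pattern, with interrupt state modeled as a flag (the macro bodies are a sketch in the spirit of the patch's helpers, not verbatim kernel code):

#include <stdio.h>

static int irqs_off;

#ifdef PREEMPT_RT_FULL
/* RT: only snapshot the state; the sleeping lock provides exclusion */
# define local_irq_save_nort(f)    do { (f) = irqs_off; } while (0)
# define local_irq_restore_nort(f) do { (void)(f); } while (0)
#else
/* non-RT: behaves like the real local_irq_save()/restore() */
# define local_irq_save_nort(f)    do { (f) = irqs_off; irqs_off = 1; } while (0)
# define local_irq_restore_nort(f) do { irqs_off = (f); } while (0)
#endif

int main(void)
{
	int flags;

	local_irq_save_nort(flags);
	printf("irqs_off=%d\n", irqs_off);	/* 0 on RT, 1 otherwise */
	local_irq_restore_nort(flags);
	return 0;
}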
19923 diff -Nur linux-4.9.6.orig/kernel/watchdog.c linux-4.9.6/kernel/watchdog.c
19924 --- linux-4.9.6.orig/kernel/watchdog.c 2017-01-26 08:25:24.000000000 +0100
19925 +++ linux-4.9.6/kernel/watchdog.c 2017-01-28 13:59:10.091662336 +0100
19926 @@ -315,6 +315,8 @@
19928 #ifdef CONFIG_HARDLOCKUP_DETECTOR
19930 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
19932 static struct perf_event_attr wd_hw_attr = {
19933 .type = PERF_TYPE_HARDWARE,
19934 .config = PERF_COUNT_HW_CPU_CYCLES,
19935 @@ -348,6 +350,13 @@
19936 /* only print hardlockups once */
19937 if (__this_cpu_read(hard_watchdog_warn) == true)
19938 return;
19939 + /*
19940 + * If early-printk is enabled then make sure we do not
19941 + * lock up in printk() and kill console logging:
19942 + */
19943 + printk_kill();
19944 +
19945 + raw_spin_lock(&watchdog_output_lock);
19947 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
19949 @@ -365,6 +374,7 @@
19950 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
19951 trigger_allbutself_cpu_backtrace();
19953 + raw_spin_unlock(&watchdog_output_lock);
19954 if (hardlockup_panic)
19955 nmi_panic(regs, "Hard LOCKUP");
19957 @@ -512,6 +522,7 @@
19958 /* kick off the timer for the hardlockup detector */
19959 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
19960 hrtimer->function = watchdog_timer_fn;
19961 + hrtimer->irqsafe = 1;
19963 /* Enable the perf event */
19964 watchdog_nmi_enable(cpu);
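Several CPUs can detect a hard lockup at the same time, and their multi-line reports would interleave in the log; the new raw spinlock serializes them. It has to be a raw lock because an ordinary spinlock sleeps on RT, which is impossible in this NMI-driven path. A userspace analogue of the same serialization (a sketch, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t output_lock;

static void *report(void *arg)
{
	/* without the lock the two line pairs could interleave */
	pthread_spin_lock(&output_lock);
	printf("cpu %ld: lockup detected\n", (long)arg);
	printf("cpu %ld: backtrace follows\n", (long)arg);
	pthread_spin_unlock(&output_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	pthread_spin_init(&output_lock, PTHREAD_PROCESS_PRIVATE);
	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, report, (void *)i);
	for (long i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}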
19965 diff -Nur linux-4.9.6.orig/kernel/workqueue.c linux-4.9.6/kernel/workqueue.c
19966 --- linux-4.9.6.orig/kernel/workqueue.c 2017-01-26 08:25:24.000000000 +0100
19967 +++ linux-4.9.6/kernel/workqueue.c 2017-01-28 13:59:10.095662491 +0100
19969 #include <linux/nodemask.h>
19970 #include <linux/moduleparam.h>
19971 #include <linux/uaccess.h>
19972 +#include <linux/locallock.h>
19973 +#include <linux/delay.h>
19975 #include "workqueue_internal.h"
19977 @@ -121,11 +123,16 @@
19978 * cpu or grabbing pool->lock is enough for read access. If
19979 * POOL_DISASSOCIATED is set, it's identical to L.
19981 + * On RT we need the extra protection via rt_lock_idle_list() for
19982 + * the list manipulations against read access from
19983 + * wq_worker_sleeping(). All other places are nicely serialized via
19984 + * pool->lock.
19985 + *
19986 * A: pool->attach_mutex protected.
19988 * PL: wq_pool_mutex protected.
19990 - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
19991 + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
19993 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
19995 @@ -134,7 +141,7 @@
19997 * WQ: wq->mutex protected.
19999 - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
20000 + * WR: wq->mutex protected for writes. RCU protected for reads.
20002 * MD: wq_mayday_lock protected.
20004 @@ -185,7 +192,7 @@
20005 atomic_t nr_running ____cacheline_aligned_in_smp;
20008 - * Destruction of pool is sched-RCU protected to allow dereferences
20009 + * Destruction of pool is RCU protected to allow dereferences
20010 * from get_work_pool().
20012 struct rcu_head rcu;
20013 @@ -214,7 +221,7 @@
20015 * Release of unbound pwq is punted to system_wq. See put_pwq()
20016 * and pwq_unbound_release_workfn() for details. pool_workqueue
20017 - * itself is also sched-RCU protected so that the first pwq can be
20018 + * itself is also RCU protected so that the first pwq can be
20019 * determined without grabbing wq->mutex.
20021 struct work_struct unbound_release_work;
20022 @@ -348,6 +355,8 @@
20023 struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
20024 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
20026 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
20028 static int worker_thread(void *__worker);
20029 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20031 @@ -355,20 +364,20 @@
20032 #include <trace/events/workqueue.h>
20034 #define assert_rcu_or_pool_mutex() \
20035 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20036 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20037 !lockdep_is_held(&wq_pool_mutex), \
20038 - "sched RCU or wq_pool_mutex should be held")
20039 + "RCU or wq_pool_mutex should be held")
20041 #define assert_rcu_or_wq_mutex(wq) \
20042 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20043 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20044 !lockdep_is_held(&wq->mutex), \
20045 - "sched RCU or wq->mutex should be held")
20046 + "RCU or wq->mutex should be held")
20048 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
20049 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
20050 + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
20051 !lockdep_is_held(&wq->mutex) && \
20052 !lockdep_is_held(&wq_pool_mutex), \
20053 - "sched RCU, wq->mutex or wq_pool_mutex should be held")
20054 + "RCU, wq->mutex or wq_pool_mutex should be held")
20056 #define for_each_cpu_worker_pool(pool, cpu) \
20057 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
20058 @@ -380,7 +389,7 @@
20059 * @pool: iteration cursor
20060 * @pi: integer used for iteration
20062 - * This must be called either with wq_pool_mutex held or sched RCU read
20063 + * This must be called either with wq_pool_mutex held or RCU read
20064 * locked. If the pool needs to be used beyond the locking in effect, the
20065 * caller is responsible for guaranteeing that the pool stays online.
20067 @@ -412,7 +421,7 @@
20068 * @pwq: iteration cursor
20069 * @wq: the target workqueue
20071 - * This must be called either with wq->mutex held or sched RCU read locked.
20072 + * This must be called either with wq->mutex held or RCU read locked.
20073 * If the pwq needs to be used beyond the locking in effect, the caller is
20074 * responsible for guaranteeing that the pwq stays online.
20076 @@ -424,6 +433,31 @@
20077 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
20080 +#ifdef CONFIG_PREEMPT_RT_BASE
20081 +static inline void rt_lock_idle_list(struct worker_pool *pool)
20082 +{
20083 + preempt_disable();
20084 +}
20085 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
20086 +{
20087 + preempt_enable();
20088 +}
20089 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
20090 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
20091 +#else
20092 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
20093 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
20094 +static inline void sched_lock_idle_list(struct worker_pool *pool)
20095 +{
20096 + spin_lock_irq(&pool->lock);
20097 +}
20098 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
20099 +{
20100 + spin_unlock_irq(&pool->lock);
20101 +}
20102 +#endif
20105 #ifdef CONFIG_DEBUG_OBJECTS_WORK
20107 static struct debug_obj_descr work_debug_descr;
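The helpers above encode two alternative protections for pool->idle_list: on RT, writers bracket the list updates with rt_lock_idle_list(), i.e. preemption disabling, so the reader in wq_worker_sleeping() sees a stable list, while non-RT keeps relying on pool->lock through sched_lock_idle_list(). The compile-time shape of that arrangement, sketched for userspace (names and output are illustrative):

#include <stdio.h>

#ifdef RT
static void rt_lock(void)      { printf("preempt_disable()\n"); }
static void rt_unlock(void)    { printf("preempt_enable()\n"); }
static void sched_lock(void)   { }	/* covered by rt_lock() users */
static void sched_unlock(void) { }
#else
static void rt_lock(void)      { }	/* pool->lock already held */
static void rt_unlock(void)    { }
static void sched_lock(void)   { printf("spin_lock_irq(&pool->lock)\n"); }
static void sched_unlock(void) { printf("spin_unlock_irq(&pool->lock)\n"); }
#endif

int main(void)
{
	rt_lock();		/* writer: idle_list add/del */
	rt_unlock();
	sched_lock();		/* wakeup path in wq_worker_sleeping() */
	sched_unlock();
	return 0;
}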
20108 @@ -548,7 +582,7 @@
20109 * @wq: the target workqueue
20110 * @node: the node ID
20112 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
20113 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
20115 * If the pwq needs to be used beyond the locking in effect, the caller is
20116 * responsible for guaranteeing that the pwq stays online.
20117 @@ -692,8 +726,8 @@
20118 * @work: the work item of interest
20120 * Pools are created and destroyed under wq_pool_mutex, and allows read
20121 - * access under sched-RCU read lock. As such, this function should be
20122 - * called under wq_pool_mutex or with preemption disabled.
20123 + * access under RCU read lock. As such, this function should be
20124 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
20126 * All fields of the returned pool are accessible as long as the above
20127 * mentioned locking is in effect. If the returned pool needs to be used
20128 @@ -830,50 +864,45 @@
20130 static void wake_up_worker(struct worker_pool *pool)
20132 - struct worker *worker = first_idle_worker(pool);
20133 + struct worker *worker;
20135 + rt_lock_idle_list(pool);
20137 + worker = first_idle_worker(pool);
20139 if (likely(worker))
20140 wake_up_process(worker->task);
20142 + rt_unlock_idle_list(pool);
20146 - * wq_worker_waking_up - a worker is waking up
20147 + * wq_worker_running - a worker is running again
20148 * @task: task waking up
20149 - * @cpu: CPU @task is waking up to
20151 - * This function is called during try_to_wake_up() when a worker is
20152 - * being awoken.
20153 - *
20154 - * CONTEXT:
20155 - * spin_lock_irq(rq->lock)
20156 + * This function is called when a worker returns from schedule()
20158 -void wq_worker_waking_up(struct task_struct *task, int cpu)
20159 +void wq_worker_running(struct task_struct *task)
20161 struct worker *worker = kthread_data(task);
20163 - if (!(worker->flags & WORKER_NOT_RUNNING)) {
20164 - WARN_ON_ONCE(worker->pool->cpu != cpu);
20165 + if (!worker->sleeping)
20166 + return;
20167 + if (!(worker->flags & WORKER_NOT_RUNNING))
20168 atomic_inc(&worker->pool->nr_running);
20169 - }
20170 + worker->sleeping = 0;
20174 * wq_worker_sleeping - a worker is going to sleep
20175 * @task: task going to sleep
20177 - * This function is called during schedule() when a busy worker is
20178 - * going to sleep. Worker on the same cpu can be woken up by
20179 - * returning pointer to its task.
20181 - * CONTEXT:
20182 - * spin_lock_irq(rq->lock)
20184 - * RETURNS:
20185 - * Worker task on @cpu to wake up, %NULL if none.
20186 + * This function is called from schedule() when a busy worker is
20187 + * going to sleep.
20189 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
20190 +void wq_worker_sleeping(struct task_struct *task)
20192 - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
20193 + struct worker *worker = kthread_data(task);
20194 struct worker_pool *pool;
20197 @@ -882,29 +911,26 @@
20198 * checking NOT_RUNNING.
20200 if (worker->flags & WORKER_NOT_RUNNING)
20201 - return NULL;
20202 + return;
20204 pool = worker->pool;
20206 - /* this can only happen on the local cpu */
20207 - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
20208 - return NULL;
20209 + if (WARN_ON_ONCE(worker->sleeping))
20210 + return;
20211 +
20212 + worker->sleeping = 1;
20215 * The counterpart of the following dec_and_test, implied mb,
20216 * worklist not empty test sequence is in insert_work().
20217 * Please read comment there.
20219 - * NOT_RUNNING is clear. This means that we're bound to and
20220 - * running on the local cpu w/ rq lock held and preemption
20221 - * disabled, which in turn means that none else could be
20222 - * manipulating idle_list, so dereferencing idle_list without pool
20223 - * lock is safe.
20225 if (atomic_dec_and_test(&pool->nr_running) &&
20226 - !list_empty(&pool->worklist))
20227 - to_wakeup = first_idle_worker(pool);
20228 - return to_wakeup ? to_wakeup->task : NULL;
20229 + !list_empty(&pool->worklist)) {
20230 + sched_lock_idle_list(pool);
20231 + wake_up_worker(pool);
20232 + sched_unlock_idle_list(pool);
20233 + }
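The old hook handed the task to wake back to the scheduler, which performed the wakeup while still holding rq->lock; on RT that nesting is not tolerable, so wq_worker_sleeping() now issues the wakeup itself and the new worker->sleeping flag makes the nr_running accounting pair up exactly once per sleep. A heavily simplified sketch of the flag discipline (an illustration, not the kernel's locking):

#include <stdatomic.h>
#include <stdio.h>

struct worker {
	int sleeping;
	atomic_int *nr_running;
};

static void worker_sleeping(struct worker *w)
{
	if (w->sleeping)
		return;			/* already accounted */
	w->sleeping = 1;
	atomic_fetch_sub(w->nr_running, 1);
}

static void worker_running(struct worker *w)
{
	if (!w->sleeping)
		return;			/* never went to sleep */
	atomic_fetch_add(w->nr_running, 1);
	w->sleeping = 0;
}

int main(void)
{
	atomic_int nr_running = 1;
	struct worker w = { 0, &nr_running };

	worker_sleeping(&w);
	worker_sleeping(&w);		/* second call is a no-op */
	worker_running(&w);
	printf("nr_running=%d\n", atomic_load(&nr_running));	/* 1 */
	return 0;
}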
20237 @@ -1098,12 +1124,14 @@
20241 - * As both pwqs and pools are sched-RCU protected, the
20242 + * As both pwqs and pools are RCU protected, the
20243 * following lock operations are safe.
20245 - spin_lock_irq(&pwq->pool->lock);
20246 + rcu_read_lock();
20247 + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
20249 - spin_unlock_irq(&pwq->pool->lock);
20250 + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
20251 + rcu_read_unlock();
20255 @@ -1207,7 +1235,7 @@
20256 struct worker_pool *pool;
20257 struct pool_workqueue *pwq;
20259 - local_irq_save(*flags);
20260 + local_lock_irqsave(pendingb_lock, *flags);
20262 /* try to steal the timer if it exists */
20264 @@ -1226,6 +1254,7 @@
20265 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
20266 return 0;
20268 + rcu_read_lock();
20270 * The queueing is in progress, or it is already queued. Try to
20271 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
20272 @@ -1264,14 +1293,16 @@
20273 set_work_pool_and_keep_pending(work, pool->id);
20275 spin_unlock(&pool->lock);
20276 + rcu_read_unlock();
20279 spin_unlock(&pool->lock);
20281 - local_irq_restore(*flags);
20282 + rcu_read_unlock();
20283 + local_unlock_irqrestore(pendingb_lock, *flags);
20284 if (work_is_canceling(work))
20291 @@ -1373,7 +1404,7 @@
20292 * queued or lose PENDING. Grabbing PENDING and queueing should
20293 * happen with IRQ disabled.
20295 - WARN_ON_ONCE(!irqs_disabled());
20296 + WARN_ON_ONCE_NONRT(!irqs_disabled());
20298 debug_work_activate(work);
20300 @@ -1381,6 +1412,7 @@
20301 if (unlikely(wq->flags & __WQ_DRAINING) &&
20302 WARN_ON_ONCE(!is_chained_work(wq)))
20303 return;
20304 + rcu_read_lock();
20305 retry:
20306 if (req_cpu == WORK_CPU_UNBOUND)
20307 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
20308 @@ -1437,10 +1469,8 @@
20309 /* pwq determined, queue */
20310 trace_workqueue_queue_work(req_cpu, pwq, work);
20312 - if (WARN_ON(!list_empty(&work->entry))) {
20313 - spin_unlock(&pwq->pool->lock);
20316 + if (WARN_ON(!list_empty(&work->entry)))
20319 pwq->nr_in_flight[pwq->work_color]++;
20320 work_flags = work_color_to_flags(pwq->work_color);
20321 @@ -1458,7 +1488,9 @@
20323 insert_work(pwq, work, worklist, work_flags);
20326 spin_unlock(&pwq->pool->lock);
20327 + rcu_read_unlock();
20331 @@ -1478,14 +1510,14 @@
20333 unsigned long flags;
20335 - local_irq_save(flags);
20336 + local_lock_irqsave(pendingb_lock,flags);
20338 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20339 __queue_work(cpu, wq, work);
20343 - local_irq_restore(flags);
20344 + local_unlock_irqrestore(pendingb_lock, flags);
20347 EXPORT_SYMBOL(queue_work_on);
20348 @@ -1552,14 +1584,14 @@
20349 unsigned long flags;
20351 /* read the comment in __queue_work() */
20352 - local_irq_save(flags);
20353 + local_lock_irqsave(pendingb_lock, flags);
20355 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
20356 __queue_delayed_work(cpu, wq, dwork, delay);
20360 - local_irq_restore(flags);
20361 + local_unlock_irqrestore(pendingb_lock, flags);
20364 EXPORT_SYMBOL(queue_delayed_work_on);
20365 @@ -1594,7 +1626,7 @@
20367 if (likely(ret >= 0)) {
20368 __queue_delayed_work(cpu, wq, dwork, delay);
20369 - local_irq_restore(flags);
20370 + local_unlock_irqrestore(pendingb_lock, flags);
20373 /* -ENOENT from try_to_grab_pending() becomes %true */
20374 @@ -1627,7 +1659,9 @@
20375 worker->last_active = jiffies;
20377 /* idle_list is LIFO */
20378 + rt_lock_idle_list(pool);
20379 list_add(&worker->entry, &pool->idle_list);
20380 + rt_unlock_idle_list(pool);
20382 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
20383 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
20384 @@ -1660,7 +1694,9 @@
20386 worker_clr_flags(worker, WORKER_IDLE);
20388 + rt_lock_idle_list(pool);
20389 list_del_init(&worker->entry);
20390 + rt_unlock_idle_list(pool);
20393 static struct worker *alloc_worker(int node)
20394 @@ -1826,7 +1862,9 @@
20395 pool->nr_workers--;
20398 + rt_lock_idle_list(pool);
20399 list_del_init(&worker->entry);
20400 + rt_unlock_idle_list(pool);
20401 worker->flags |= WORKER_DIE;
20402 wake_up_process(worker->task);
20404 @@ -2785,14 +2823,14 @@
20408 - local_irq_disable();
20409 + rcu_read_lock();
20410 pool = get_work_pool(work);
20412 - local_irq_enable();
20413 + rcu_read_unlock();
20417 - spin_lock(&pool->lock);
20418 + spin_lock_irq(&pool->lock);
20419 /* see the comment in try_to_grab_pending() with the same code */
20420 pwq = get_work_pwq(work);
20422 @@ -2821,10 +2859,11 @@
20424 lock_map_acquire_read(&pwq->wq->lockdep_map);
20425 lock_map_release(&pwq->wq->lockdep_map);
20427 + rcu_read_unlock();
20430 spin_unlock_irq(&pool->lock);
20431 + rcu_read_unlock();
20435 @@ -2911,7 +2950,7 @@
20437 /* tell other tasks trying to grab @work to back off */
20438 mark_work_canceling(work);
20439 - local_irq_restore(flags);
20440 + local_unlock_irqrestore(pendingb_lock, flags);
20443 clear_work_data(work);
20444 @@ -2966,10 +3005,10 @@
20446 bool flush_delayed_work(struct delayed_work *dwork)
20448 - local_irq_disable();
20449 + local_lock_irq(pendingb_lock);
20450 if (del_timer_sync(&dwork->timer))
20451 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
20452 - local_irq_enable();
20453 + local_unlock_irq(pendingb_lock);
20454 return flush_work(&dwork->work);
20456 EXPORT_SYMBOL(flush_delayed_work);
20457 @@ -2987,7 +3026,7 @@
20460 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
20461 - local_irq_restore(flags);
20462 + local_unlock_irqrestore(pendingb_lock, flags);
20466 @@ -3245,7 +3284,7 @@
20467 * put_unbound_pool - put a worker_pool
20468 * @pool: worker_pool to put
20470 - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
20471 + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
20472 * safe manner. get_unbound_pool() calls this function on its failure path
20473 * and this function should be able to release pools which went through,
20474 * successfully or not, init_worker_pool().
20475 @@ -3299,8 +3338,8 @@
20476 del_timer_sync(&pool->idle_timer);
20477 del_timer_sync(&pool->mayday_timer);
20479 - /* sched-RCU protected to allow dereferences from get_work_pool() */
20480 - call_rcu_sched(&pool->rcu, rcu_free_pool);
20481 + /* RCU protected to allow dereferences from get_work_pool() */
20482 + call_rcu(&pool->rcu, rcu_free_pool);
20486 @@ -3407,14 +3446,14 @@
20487 put_unbound_pool(pool);
20488 mutex_unlock(&wq_pool_mutex);
20490 - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
20491 + call_rcu(&pwq->rcu, rcu_free_pwq);
20494 * If we're the last pwq going away, @wq is already dead and no one
20495 * is gonna access it anymore. Schedule RCU free.
20498 - call_rcu_sched(&wq->rcu, rcu_free_wq);
20499 + call_rcu(&wq->rcu, rcu_free_wq);
20503 @@ -4064,7 +4103,7 @@
20504 * The base ref is never dropped on per-cpu pwqs. Directly
20505 * schedule RCU free.
20507 - call_rcu_sched(&wq->rcu, rcu_free_wq);
20508 + call_rcu(&wq->rcu, rcu_free_wq);
20511 * We're the sole accessor of @wq at this point. Directly
20512 @@ -4157,7 +4196,8 @@
20513 struct pool_workqueue *pwq;
20516 - rcu_read_lock_sched();
20517 + rcu_read_lock();
20518 + preempt_disable();
20520 if (cpu == WORK_CPU_UNBOUND)
20521 cpu = smp_processor_id();
20522 @@ -4168,7 +4208,8 @@
20523 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
20525 ret = !list_empty(&pwq->delayed_works);
20526 - rcu_read_unlock_sched();
20527 + preempt_enable();
20528 + rcu_read_unlock();
20532 @@ -4194,15 +4235,15 @@
20533 if (work_pending(work))
20534 ret |= WORK_BUSY_PENDING;
20536 - local_irq_save(flags);
20537 + rcu_read_lock();
20538 pool = get_work_pool(work);
20540 - spin_lock(&pool->lock);
20541 + spin_lock_irqsave(&pool->lock, flags);
20542 if (find_worker_executing_work(pool, work))
20543 ret |= WORK_BUSY_RUNNING;
20544 - spin_unlock(&pool->lock);
20545 + spin_unlock_irqrestore(&pool->lock, flags);
20547 - local_irq_restore(flags);
20548 + rcu_read_unlock();
20552 @@ -4391,7 +4432,7 @@
20553 unsigned long flags;
20556 - rcu_read_lock_sched();
20557 + rcu_read_lock();
20559 pr_info("Showing busy workqueues and worker pools:\n");
20561 @@ -4444,7 +4485,7 @@
20562 spin_unlock_irqrestore(&pool->lock, flags);
20565 - rcu_read_unlock_sched();
20566 + rcu_read_unlock();
20570 @@ -4782,16 +4823,16 @@
20571 * nr_active is monotonically decreasing. It's safe
20572 * to peek without lock.
20574 - rcu_read_lock_sched();
20575 + rcu_read_lock();
20576 for_each_pwq(pwq, wq) {
20577 WARN_ON_ONCE(pwq->nr_active < 0);
20578 if (pwq->nr_active) {
20580 - rcu_read_unlock_sched();
20581 + rcu_read_unlock();
20585 - rcu_read_unlock_sched();
20586 + rcu_read_unlock();
20589 mutex_unlock(&wq_pool_mutex);
20590 @@ -4981,7 +5022,8 @@
20591 const char *delim = "";
20592 int node, written = 0;
20594 - rcu_read_lock_sched();
20595 + get_online_cpus();
20596 + rcu_read_lock();
20597 for_each_node(node) {
20598 written += scnprintf(buf + written, PAGE_SIZE - written,
20599 "%s%d:%d", delim, node,
20600 @@ -4989,7 +5031,8 @@
20603 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
20604 - rcu_read_unlock_sched();
20605 + rcu_read_unlock();
20606 + put_online_cpus();
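Every rcu_read_lock_sched()/call_rcu_sched() in this file becomes plain RCU: on RT, a preempt- or irq-disabled region no longer implies an RCU-sched read side, so the read sections have to be explicit. pendingb_lock is a local lock, which a non-RT kernel compiles down to local_irq_save() while RT turns it into a per-CPU lock that keeps the section preemptible. A userspace analogue of the local-lock idea, with per-CPU modeled as per-index (a sketch under that assumption):

#include <pthread.h>
#include <stdio.h>

#define NCPU 2

static pthread_mutex_t pending_lock[NCPU] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
static int pending[NCPU];

static void queue_on(int cpu, int n)
{
	pthread_mutex_lock(&pending_lock[cpu]);	  /* local_lock_irqsave() */
	pending[cpu] += n;			  /* per-"CPU" state */
	pthread_mutex_unlock(&pending_lock[cpu]); /* local_unlock_irqrestore() */
}

int main(void)
{
	queue_on(0, 1);
	queue_on(1, 2);
	printf("pending: cpu0=%d cpu1=%d\n", pending[0], pending[1]);
	return 0;
}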
20610 diff -Nur linux-4.9.6.orig/kernel/workqueue_internal.h linux-4.9.6/kernel/workqueue_internal.h
20611 --- linux-4.9.6.orig/kernel/workqueue_internal.h 2017-01-26 08:25:24.000000000 +0100
20612 +++ linux-4.9.6/kernel/workqueue_internal.h 2017-01-28 13:59:10.095662491 +0100
20614 unsigned long last_active; /* L: last active timestamp */
20615 unsigned int flags; /* X: flags */
20616 int id; /* I: worker id */
20617 + int sleeping; /* None */
20620 * Opaque string set with work_set_desc(). Printed out with task
20622 * Scheduler hooks for concurrency managed workqueue. Only to be used from
20623 * sched/core.c and workqueue.c.
20625 -void wq_worker_waking_up(struct task_struct *task, int cpu);
20626 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
20627 +void wq_worker_running(struct task_struct *task);
20628 +void wq_worker_sleeping(struct task_struct *task);
20630 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
20631 diff -Nur linux-4.9.6.orig/lib/debugobjects.c linux-4.9.6/lib/debugobjects.c
20632 --- linux-4.9.6.orig/lib/debugobjects.c 2017-01-26 08:25:24.000000000 +0100
20633 +++ linux-4.9.6/lib/debugobjects.c 2017-01-28 13:59:10.095662491 +0100
20634 @@ -308,7 +308,10 @@
20635 struct debug_obj *obj;
20636 unsigned long flags;
20638 - fill_pool();
20639 +#ifdef CONFIG_PREEMPT_RT_FULL
20640 + if (preempt_count() == 0 && !irqs_disabled())
20641 +#endif
20642 + fill_pool();
20644 db = get_bucket((unsigned long) addr);
20646 diff -Nur linux-4.9.6.orig/lib/idr.c linux-4.9.6/lib/idr.c
20647 --- linux-4.9.6.orig/lib/idr.c 2017-01-26 08:25:24.000000000 +0100
20648 +++ linux-4.9.6/lib/idr.c 2017-01-28 13:59:10.095662491 +0100
20650 #include <linux/idr.h>
20651 #include <linux/spinlock.h>
20652 #include <linux/percpu.h>
20653 +#include <linux/locallock.h>
20655 #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
20656 #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
20658 static DEFINE_PER_CPU(int, idr_preload_cnt);
20659 static DEFINE_SPINLOCK(simple_ida_lock);
20661 +#ifdef CONFIG_PREEMPT_RT_FULL
20662 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
20664 +static inline void idr_preload_lock(void)
20665 +{
20666 + local_lock(idr_lock);
20667 +}
20668 +
20669 +static inline void idr_preload_unlock(void)
20670 +{
20671 + local_unlock(idr_lock);
20672 +}
20673 +
20674 +void idr_preload_end(void)
20675 +{
20676 + idr_preload_unlock();
20677 +}
20678 +EXPORT_SYMBOL(idr_preload_end);
20679 +#else
20680 +static inline void idr_preload_lock(void)
20681 +{
20682 + preempt_disable();
20683 +}
20684 +
20685 +static inline void idr_preload_unlock(void)
20686 +{
20687 + preempt_enable();
20688 +}
20689 +#endif
20692 /* the maximum ID which can be allocated given idr->layers */
20693 static int idr_max(int layers)
20695 @@ -115,14 +147,14 @@
20696 * context. See idr_preload() for details.
20698 if (!in_interrupt()) {
20699 - preempt_disable();
20700 + idr_preload_lock();
20701 new = __this_cpu_read(idr_preload_head);
20703 __this_cpu_write(idr_preload_head, new->ary[0]);
20704 __this_cpu_dec(idr_preload_cnt);
20705 new->ary[0] = NULL;
20707 - preempt_enable();
20708 + idr_preload_unlock();
20712 @@ -366,7 +398,6 @@
20713 idr_mark_full(pa, id);
20718 * idr_preload - preload for idr_alloc()
20719 * @gfp_mask: allocation mask to use for preloading
20720 @@ -401,7 +432,7 @@
20721 WARN_ON_ONCE(in_interrupt());
20722 might_sleep_if(gfpflags_allow_blocking(gfp_mask));
20724 - preempt_disable();
20725 + idr_preload_lock();
20728 * idr_alloc() is likely to succeed w/o full idr_layer buffer and
20729 @@ -413,9 +444,9 @@
20730 while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
20731 struct idr_layer *new;
20733 - preempt_enable();
20734 + idr_preload_unlock();
20735 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
20736 - preempt_disable();
20737 + idr_preload_lock();
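idr_preload() stocks a per-CPU reserve of tree layers that the following allocation consumes. Non-RT pins the task with preempt_disable() until idr_preload_end(); RT cannot hold that across kmem_cache_zalloc(), so the idr_lock local lock takes over and idr_preload_end() becomes a real exported function that drops it. The underlying pattern, reserve while sleeping is allowed and consume where it is not, sketched for userspace (sizes and names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define WANT 4

static void *cache[WANT];
static int cached;

static void preload(void)
{
	while (cached < WANT) {
		void *p = malloc(64);	/* may "sleep": done unlocked */
		if (!p)
			break;
		cache[cached++] = p;
	}
	/* the real code would now take the lock and hold it */
}

static void *alloc_locked(void)
{
	/* inside the locked region: only consume, never allocate */
	return cached ? cache[--cached] : NULL;
}

int main(void)
{
	preload();
	void *node = alloc_locked();
	printf("got node %p, %d left in cache\n", node, cached);
	free(node);
	while (cached)
		free(cache[--cached]);
	return 0;
}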
20741 diff -Nur linux-4.9.6.orig/lib/irq_poll.c linux-4.9.6/lib/irq_poll.c
20742 --- linux-4.9.6.orig/lib/irq_poll.c 2017-01-26 08:25:24.000000000 +0100
20743 +++ linux-4.9.6/lib/irq_poll.c 2017-01-28 13:59:10.095662491 +0100
20745 list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
20746 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20747 local_irq_restore(flags);
20748 + preempt_check_resched_rt();
20750 EXPORT_SYMBOL(irq_poll_sched);
20753 local_irq_save(flags);
20754 __irq_poll_complete(iop);
20755 local_irq_restore(flags);
20756 + preempt_check_resched_rt();
20758 EXPORT_SYMBOL(irq_poll_complete);
20763 local_irq_enable();
20764 + preempt_check_resched_rt();
20766 /* Even though interrupts have been re-enabled, this
20767 * access is safe because interrupts can only add new
20768 @@ -132,6 +135,7 @@
20769 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20771 local_irq_enable();
20772 + preempt_check_resched_rt();
20776 @@ -195,6 +199,7 @@
20777 this_cpu_ptr(&blk_cpu_iopoll));
20778 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
20779 local_irq_enable();
20780 + preempt_check_resched_rt();
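Each irq_poll path that re-enables interrupts now ends with preempt_check_resched_rt(): on RT a raised softirq makes the softirq thread runnable, and once the irq-off region ends something has to notice that, because nothing will preempt the current context on its own. A toy model of the deferred check, with flags standing in for real interrupts (illustrative only):

#include <stdio.h>

static int irqs_off;
static int wakeup_pending;

static void raise_softirq(void)
{
	wakeup_pending = 1;	/* cannot act while irqs_off is set */
}

static void local_irq_restore_and_check(void)
{
	irqs_off = 0;
	if (wakeup_pending) {	/* preempt_check_resched_rt() */
		wakeup_pending = 0;
		printf("waking softirq thread\n");
	}
}

int main(void)
{
	irqs_off = 1;
	raise_softirq();
	local_irq_restore_and_check();
	return 0;
}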
20784 diff -Nur linux-4.9.6.orig/lib/Kconfig linux-4.9.6/lib/Kconfig
20785 --- linux-4.9.6.orig/lib/Kconfig 2017-01-26 08:25:24.000000000 +0100
20786 +++ linux-4.9.6/lib/Kconfig 2017-01-28 13:59:10.095662491 +0100
20787 @@ -400,6 +400,7 @@
20789 config CPUMASK_OFFSTACK
20790 bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
20791 + depends on !PREEMPT_RT_FULL
20793 Use dynamic allocation for cpumask_var_t, instead of putting
20794 them on the stack. This is a bit more expensive, but avoids
20795 diff -Nur linux-4.9.6.orig/lib/locking-selftest.c linux-4.9.6/lib/locking-selftest.c
20796 --- linux-4.9.6.orig/lib/locking-selftest.c 2017-01-26 08:25:24.000000000 +0100
20797 +++ linux-4.9.6/lib/locking-selftest.c 2017-01-28 13:59:10.095662491 +0100
20798 @@ -590,6 +590,8 @@
20799 #include "locking-selftest-spin-hardirq.h"
20800 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
20802 +#ifndef CONFIG_PREEMPT_RT_FULL
20804 #include "locking-selftest-rlock-hardirq.h"
20805 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
20807 @@ -605,9 +607,12 @@
20808 #include "locking-selftest-wlock-softirq.h"
20809 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
20816 +#ifndef CONFIG_PREEMPT_RT_FULL
20818 * Enabling hardirqs with a softirq-safe lock held:
20820 @@ -640,6 +645,8 @@
20827 * Enabling irqs with an irq-safe lock held:
20829 @@ -663,6 +670,8 @@
20830 #include "locking-selftest-spin-hardirq.h"
20831 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
20833 +#ifndef CONFIG_PREEMPT_RT_FULL
20835 #include "locking-selftest-rlock-hardirq.h"
20836 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
20838 @@ -678,6 +687,8 @@
20839 #include "locking-selftest-wlock-softirq.h"
20840 GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
20847 @@ -709,6 +720,8 @@
20848 #include "locking-selftest-spin-hardirq.h"
20849 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
20851 +#ifndef CONFIG_PREEMPT_RT_FULL
20853 #include "locking-selftest-rlock-hardirq.h"
20854 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
20856 @@ -724,6 +737,8 @@
20857 #include "locking-selftest-wlock-softirq.h"
20858 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
20865 @@ -757,6 +772,8 @@
20866 #include "locking-selftest-spin-hardirq.h"
20867 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
20869 +#ifndef CONFIG_PREEMPT_RT_FULL
20871 #include "locking-selftest-rlock-hardirq.h"
20872 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
20874 @@ -772,10 +789,14 @@
20875 #include "locking-selftest-wlock-softirq.h"
20876 GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
20884 +#ifndef CONFIG_PREEMPT_RT_FULL
20887 * read-lock / write-lock irq inversion.
20889 @@ -838,6 +859,10 @@
20895 +#ifndef CONFIG_PREEMPT_RT_FULL
20898 * read-lock / write-lock recursion that is actually safe.
20900 @@ -876,6 +901,8 @@
20907 * read-lock / write-lock recursion that is unsafe.
20909 @@ -1858,6 +1885,7 @@
20911 printk(" --------------------------------------------------------------------------\n");
20913 +#ifndef CONFIG_PREEMPT_RT_FULL
20915 * irq-context testcases:
20917 @@ -1870,6 +1898,28 @@
20919 DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
20920 // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
20922 + /* On -rt, we only do hardirq context test for raw spinlock */
20923 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
20924 + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
20926 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
20927 + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
20929 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
20930 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
20931 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
20932 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
20933 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
20934 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
20936 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
20937 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
20938 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
20939 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
20940 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
20941 + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
20946 diff -Nur linux-4.9.6.orig/lib/percpu_ida.c linux-4.9.6/lib/percpu_ida.c
20947 --- linux-4.9.6.orig/lib/percpu_ida.c 2017-01-26 08:25:24.000000000 +0100
20948 +++ linux-4.9.6/lib/percpu_ida.c 2017-01-28 13:59:10.095662491 +0100
20950 #include <linux/string.h>
20951 #include <linux/spinlock.h>
20952 #include <linux/percpu_ida.h>
20953 +#include <linux/locallock.h>
20955 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
20957 struct percpu_ida_cpu {
20959 @@ -148,13 +151,13 @@
20960 unsigned long flags;
20963 - local_irq_save(flags);
20964 + local_lock_irqsave(irq_off_lock, flags);
20965 tags = this_cpu_ptr(pool->tag_cpu);
20968 tag = alloc_local_tag(tags);
20969 if (likely(tag >= 0)) {
20970 - local_irq_restore(flags);
20971 + local_unlock_irqrestore(irq_off_lock, flags);
20975 @@ -173,6 +176,7 @@
20977 if (!tags->nr_free)
20978 alloc_global_tags(pool, tags);
20980 if (!tags->nr_free)
20981 steal_tags(pool, tags);
20983 @@ -184,7 +188,7 @@
20986 spin_unlock(&pool->lock);
20987 - local_irq_restore(flags);
20988 + local_unlock_irqrestore(irq_off_lock, flags);
20990 if (tag >= 0 || state == TASK_RUNNING)
20992 @@ -196,7 +200,7 @@
20996 - local_irq_save(flags);
20997 + local_lock_irqsave(irq_off_lock, flags);
20998 tags = this_cpu_ptr(pool->tag_cpu);
21000 if (state != TASK_RUNNING)
21001 @@ -221,7 +225,7 @@
21003 BUG_ON(tag >= pool->nr_tags);
21005 - local_irq_save(flags);
21006 + local_lock_irqsave(irq_off_lock, flags);
21007 tags = this_cpu_ptr(pool->tag_cpu);
21009 spin_lock(&tags->lock);
21010 @@ -253,7 +257,7 @@
21011 spin_unlock(&pool->lock);
21014 - local_irq_restore(flags);
21015 + local_unlock_irqrestore(irq_off_lock, flags);
21017 EXPORT_SYMBOL_GPL(percpu_ida_free);
21019 @@ -345,7 +349,7 @@
21020 struct percpu_ida_cpu *remote;
21021 unsigned cpu, i, err = 0;
21023 - local_irq_save(flags);
21024 + local_lock_irqsave(irq_off_lock, flags);
21025 for_each_possible_cpu(cpu) {
21026 remote = per_cpu_ptr(pool->tag_cpu, cpu);
21027 spin_lock(&remote->lock);
21028 @@ -367,7 +371,7 @@
21030 spin_unlock(&pool->lock);
21032 - local_irq_restore(flags);
21033 + local_unlock_irqrestore(irq_off_lock, flags);
21036 EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
21037 diff -Nur linux-4.9.6.orig/lib/radix-tree.c linux-4.9.6/lib/radix-tree.c
21038 --- linux-4.9.6.orig/lib/radix-tree.c 2017-01-26 08:25:24.000000000 +0100
21039 +++ linux-4.9.6/lib/radix-tree.c 2017-01-28 13:59:10.095662491 +0100
21040 @@ -290,13 +290,14 @@
21041 * succeed in getting a node here (and never reach
21042 * kmem_cache_alloc)
21044 - rtp = this_cpu_ptr(&radix_tree_preloads);
21045 + rtp = &get_cpu_var(radix_tree_preloads);
21048 rtp->nodes = ret->private_data;
21049 ret->private_data = NULL;
21052 + put_cpu_var(radix_tree_preloads);
21054 * Update the allocation stack trace as this is more useful
21056 @@ -336,6 +337,7 @@
21057 call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
21060 +#ifndef CONFIG_PREEMPT_RT_FULL
21062 * Load up this CPU's radix_tree_node buffer with sufficient objects to
21063 * ensure that the addition of a single element in the tree cannot fail. On
21064 @@ -455,6 +457,7 @@
21066 return __radix_tree_preload(gfp_mask, nr_nodes);
21071 * The maximum index which can be stored in a radix tree
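The allocation path switches from this_cpu_ptr(), which is only valid if the caller already disabled preemption, to get_cpu_var()/put_cpu_var(), which disables preemption for exactly the lines that touch the per-CPU preload pool; the preload API itself is compiled out on RT. The get/put discipline, with preemption modeled as a counter (userspace sketch):

#include <stdio.h>

static int preempt_count;
static int percpu_pool[4];		/* one slot per "CPU" */

static int *get_cpu_var_slot(void)
{
	preempt_count++;		/* preempt_disable() */
	return &percpu_pool[0];		/* current CPU's slot */
}

static void put_cpu_var_slot(void)
{
	preempt_count--;		/* preempt_enable() */
}

int main(void)
{
	int *slot = get_cpu_var_slot();
	*slot += 1;			/* stable: no migration possible here */
	put_cpu_var_slot();
	printf("slot=%d preempt_count=%d\n", percpu_pool[0], preempt_count);
	return 0;
}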
21072 diff -Nur linux-4.9.6.orig/lib/scatterlist.c linux-4.9.6/lib/scatterlist.c
21073 --- linux-4.9.6.orig/lib/scatterlist.c 2017-01-26 08:25:24.000000000 +0100
21074 +++ linux-4.9.6/lib/scatterlist.c 2017-01-28 13:59:10.095662491 +0100
21075 @@ -620,7 +620,7 @@
21076 flush_kernel_dcache_page(miter->page);
21078 if (miter->__flags & SG_MITER_ATOMIC) {
21079 - WARN_ON_ONCE(preemptible());
21080 + WARN_ON_ONCE(!pagefault_disabled());
21081 kunmap_atomic(miter->addr);
21083 kunmap(miter->page);
21084 @@ -664,7 +664,7 @@
21085 if (!sg_miter_skip(&miter, skip))
21088 - local_irq_save(flags);
21089 + local_irq_save_nort(flags);
21091 while (sg_miter_next(&miter) && offset < buflen) {
21093 @@ -681,7 +681,7 @@
21095 sg_miter_stop(&miter);
21097 - local_irq_restore(flags);
21098 + local_irq_restore_nort(flags);
21101 EXPORT_SYMBOL(sg_copy_buffer);
21102 diff -Nur linux-4.9.6.orig/lib/smp_processor_id.c linux-4.9.6/lib/smp_processor_id.c
21103 --- linux-4.9.6.orig/lib/smp_processor_id.c 2017-01-26 08:25:24.000000000 +0100
21104 +++ linux-4.9.6/lib/smp_processor_id.c 2017-01-28 13:59:10.095662491 +0100
21106 if (!printk_ratelimit())
21109 - printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
21110 - what1, what2, preempt_count() - 1, current->comm, current->pid);
21111 + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
21112 + what1, what2, preempt_count() - 1, __migrate_disabled(current),
21113 + current->comm, current->pid);
21115 print_symbol("caller is %s\n", (long)__builtin_return_address(0));
21117 diff -Nur linux-4.9.6.orig/mm/backing-dev.c linux-4.9.6/mm/backing-dev.c
21118 --- linux-4.9.6.orig/mm/backing-dev.c 2017-01-26 08:25:24.000000000 +0100
21119 +++ linux-4.9.6/mm/backing-dev.c 2017-01-28 13:59:10.095662491 +0100
21120 @@ -457,9 +457,9 @@
21122 unsigned long flags;
21124 - local_irq_save(flags);
21125 + local_irq_save_nort(flags);
21126 if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
21127 - local_irq_restore(flags);
21128 + local_irq_restore_nort(flags);
21132 diff -Nur linux-4.9.6.orig/mm/compaction.c linux-4.9.6/mm/compaction.c
21133 --- linux-4.9.6.orig/mm/compaction.c 2017-01-26 08:25:24.000000000 +0100
21134 +++ linux-4.9.6/mm/compaction.c 2017-01-28 13:59:10.095662491 +0100
21135 @@ -1593,10 +1593,12 @@
21136 block_start_pfn(cc->migrate_pfn, cc->order);
21138 if (cc->last_migrated_pfn < current_block_start) {
21139 - cpu = get_cpu();
21140 + cpu = get_cpu_light();
21141 + local_lock_irq(swapvec_lock);
21142 lru_add_drain_cpu(cpu);
21143 + local_unlock_irq(swapvec_lock);
21144 drain_local_pages(zone);
21145 - put_cpu();
21146 + put_cpu_light();
21147 /* No more flushing until we migrate again */
21148 cc->last_migrated_pfn = 0;
21150 diff -Nur linux-4.9.6.orig/mm/filemap.c linux-4.9.6/mm/filemap.c
21151 --- linux-4.9.6.orig/mm/filemap.c 2017-01-26 08:25:24.000000000 +0100
21152 +++ linux-4.9.6/mm/filemap.c 2017-01-28 13:59:10.099662644 +0100
21153 @@ -159,9 +159,12 @@
21154 * node->private_list is protected by
21155 * mapping->tree_lock.
21157 - if (!list_empty(&node->private_list))
21158 - list_lru_del(&workingset_shadow_nodes,
21159 + if (!list_empty(&node->private_list)) {
21160 + local_lock(workingset_shadow_lock);
21161 + list_lru_del(&__workingset_shadow_nodes,
21162 &node->private_list);
21163 + local_unlock(workingset_shadow_lock);
21168 @@ -217,8 +220,10 @@
21169 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
21170 list_empty(&node->private_list)) {
21171 node->private_data = mapping;
21172 - list_lru_add(&workingset_shadow_nodes,
21173 - &node->private_list);
21174 + local_lock(workingset_shadow_lock);
21175 + list_lru_add(&__workingset_shadow_nodes,
21176 + &node->private_list);
21177 + local_unlock(workingset_shadow_lock);
21181 diff -Nur linux-4.9.6.orig/mm/highmem.c linux-4.9.6/mm/highmem.c
21182 --- linux-4.9.6.orig/mm/highmem.c 2017-01-26 08:25:24.000000000 +0100
21183 +++ linux-4.9.6/mm/highmem.c 2017-01-28 13:59:10.099662644 +0100
21184 @@ -29,10 +29,11 @@
21185 #include <linux/kgdb.h>
21186 #include <asm/tlbflush.h>
21189 +#ifndef CONFIG_PREEMPT_RT_FULL
21190 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
21191 DEFINE_PER_CPU(int, __kmap_atomic_idx);
21192 #endif
21193 +#endif
21196 * Virtual_count is not a pure "count".
21197 @@ -107,8 +108,9 @@
21198 unsigned long totalhigh_pages __read_mostly;
21199 EXPORT_SYMBOL(totalhigh_pages);
21202 +#ifndef CONFIG_PREEMPT_RT_FULL
21203 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
21204 +#endif
21206 unsigned int nr_free_highpages (void)
21208 diff -Nur linux-4.9.6.orig/mm/Kconfig linux-4.9.6/mm/Kconfig
21209 --- linux-4.9.6.orig/mm/Kconfig 2017-01-26 08:25:24.000000000 +0100
21210 +++ linux-4.9.6/mm/Kconfig 2017-01-28 13:59:10.095662491 +0100
21211 @@ -410,7 +410,7 @@
21213 config TRANSPARENT_HUGEPAGE
21214 bool "Transparent Hugepage Support"
21215 - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
21216 + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
21218 select RADIX_TREE_MULTIORDER
21220 diff -Nur linux-4.9.6.orig/mm/memcontrol.c linux-4.9.6/mm/memcontrol.c
21221 --- linux-4.9.6.orig/mm/memcontrol.c 2017-01-26 08:25:24.000000000 +0100
21222 +++ linux-4.9.6/mm/memcontrol.c 2017-01-28 13:59:10.099662644 +0100
21224 #include <net/sock.h>
21225 #include <net/ip.h>
21227 +#include <linux/locallock.h>
21229 #include <asm/uaccess.h>
21232 #define do_swap_account 0
21235 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
21237 /* Whether legacy memory+swap accounting is active */
21238 static bool do_memsw_account(void)
21240 @@ -1692,6 +1695,7 @@
21241 #define FLUSHING_CACHED_CHARGE 0
21243 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
21244 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
21245 static DEFINE_MUTEX(percpu_charge_mutex);
21248 @@ -1714,7 +1718,7 @@
21249 if (nr_pages > CHARGE_BATCH)
21252 - local_irq_save(flags);
21253 + local_lock_irqsave(memcg_stock_ll, flags);
21255 stock = this_cpu_ptr(&memcg_stock);
21256 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
21257 @@ -1722,7 +1726,7 @@
21261 - local_irq_restore(flags);
21262 + local_unlock_irqrestore(memcg_stock_ll, flags);
21266 @@ -1749,13 +1753,13 @@
21267 struct memcg_stock_pcp *stock;
21268 unsigned long flags;
21270 - local_irq_save(flags);
21271 + local_lock_irqsave(memcg_stock_ll, flags);
21273 stock = this_cpu_ptr(&memcg_stock);
21274 drain_stock(stock);
21275 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
21277 - local_irq_restore(flags);
21278 + local_unlock_irqrestore(memcg_stock_ll, flags);
21282 @@ -1767,7 +1771,7 @@
21283 struct memcg_stock_pcp *stock;
21284 unsigned long flags;
21286 - local_irq_save(flags);
21287 + local_lock_irqsave(memcg_stock_ll, flags);
21289 stock = this_cpu_ptr(&memcg_stock);
21290 if (stock->cached != memcg) { /* reset if necessary */
21291 @@ -1776,7 +1780,7 @@
21293 stock->nr_pages += nr_pages;
21295 - local_irq_restore(flags);
21296 + local_unlock_irqrestore(memcg_stock_ll, flags);
21300 @@ -1792,7 +1796,7 @@
21302 /* Notify other cpus that system-wide "drain" is running */
21304 - curcpu = get_cpu();
21305 + curcpu = get_cpu_light();
21306 for_each_online_cpu(cpu) {
21307 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
21308 struct mem_cgroup *memcg;
21309 @@ -1809,7 +1813,7 @@
21310 schedule_work_on(cpu, &stock->work);
21316 mutex_unlock(&percpu_charge_mutex);
21318 @@ -4548,12 +4552,12 @@
21322 - local_irq_disable();
21323 + local_lock_irq(event_lock);
21324 mem_cgroup_charge_statistics(to, page, compound, nr_pages);
21325 memcg_check_events(to, page);
21326 mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
21327 memcg_check_events(from, page);
21328 - local_irq_enable();
21329 + local_unlock_irq(event_lock);
21333 @@ -5428,10 +5432,10 @@
21335 commit_charge(page, memcg, lrucare);
21337 - local_irq_disable();
21338 + local_lock_irq(event_lock);
21339 mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
21340 memcg_check_events(memcg, page);
21341 - local_irq_enable();
21342 + local_unlock_irq(event_lock);
21344 if (do_memsw_account() && PageSwapCache(page)) {
21345 swp_entry_t entry = { .val = page_private(page) };
21346 @@ -5487,14 +5491,14 @@
21347 memcg_oom_recover(memcg);
21350 - local_irq_save(flags);
21351 + local_lock_irqsave(event_lock, flags);
21352 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
21353 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
21354 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
21355 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
21356 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
21357 memcg_check_events(memcg, dummy_page);
21358 - local_irq_restore(flags);
21359 + local_unlock_irqrestore(event_lock, flags);
21361 if (!mem_cgroup_is_root(memcg))
21362 css_put_many(&memcg->css, nr_pages);
21363 @@ -5649,10 +5653,10 @@
21365 commit_charge(newpage, memcg, false);
21367 - local_irq_save(flags);
21368 + local_lock_irqsave(event_lock, flags);
21369 mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
21370 memcg_check_events(memcg, newpage);
21371 - local_irq_restore(flags);
21372 + local_unlock_irqrestore(event_lock, flags);
21375 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
21376 @@ -5832,6 +5836,7 @@
21378 struct mem_cgroup *memcg, *swap_memcg;
21379 unsigned short oldid;
21380 + unsigned long flags;
21382 VM_BUG_ON_PAGE(PageLRU(page), page);
21383 VM_BUG_ON_PAGE(page_count(page), page);
21384 @@ -5872,12 +5877,16 @@
21385 * important here to have the interrupts disabled because it is the
21386 * only synchronisation we have for updating the per-CPU variables.
21388 + local_lock_irqsave(event_lock, flags);
21389 +#ifndef CONFIG_PREEMPT_RT_BASE
21390 VM_BUG_ON(!irqs_disabled());
21391 +#endif
21392 mem_cgroup_charge_statistics(memcg, page, false, -1);
21393 memcg_check_events(memcg, page);
21395 if (!mem_cgroup_is_root(memcg))
21396 css_put(&memcg->css);
21397 + local_unlock_irqrestore(event_lock, flags);
21401 diff -Nur linux-4.9.6.orig/mm/mmu_context.c linux-4.9.6/mm/mmu_context.c
21402 --- linux-4.9.6.orig/mm/mmu_context.c 2017-01-26 08:25:24.000000000 +0100
21403 +++ linux-4.9.6/mm/mmu_context.c 2017-01-28 13:59:10.099662644 +0100
21405 struct task_struct *tsk = current;
21408 + preempt_disable_rt();
21409 active_mm = tsk->active_mm;
21410 if (active_mm != mm) {
21411 atomic_inc(&mm->mm_count);
21415 switch_mm(active_mm, mm, tsk);
21416 + preempt_enable_rt();
21418 #ifdef finish_arch_post_lock_switch
21419 finish_arch_post_lock_switch();
21420 diff -Nur linux-4.9.6.orig/mm/page_alloc.c linux-4.9.6/mm/page_alloc.c
21421 --- linux-4.9.6.orig/mm/page_alloc.c 2017-01-26 08:25:24.000000000 +0100
21422 +++ linux-4.9.6/mm/page_alloc.c 2017-01-28 13:59:10.099662644 +0100
21424 #include <linux/page_ext.h>
21425 #include <linux/hugetlb.h>
21426 #include <linux/sched/rt.h>
21427 +#include <linux/locallock.h>
21428 #include <linux/page_owner.h>
21429 #include <linux/kthread.h>
21430 #include <linux/memcontrol.h>
21431 @@ -281,6 +282,18 @@
21432 EXPORT_SYMBOL(nr_online_nodes);
21435 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
21437 +#ifdef CONFIG_PREEMPT_RT_BASE
21438 +# define cpu_lock_irqsave(cpu, flags) \
21439 + local_lock_irqsave_on(pa_lock, flags, cpu)
21440 +# define cpu_unlock_irqrestore(cpu, flags) \
21441 + local_unlock_irqrestore_on(pa_lock, flags, cpu)
21442 +#else
21443 +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
21444 +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
21445 +#endif
21447 int page_group_by_mobility_disabled __read_mostly;
21449 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
21450 @@ -1072,7 +1085,7 @@
21451 #endif /* CONFIG_DEBUG_VM */
21454 - * Frees a number of pages from the PCP lists
21455 + * Frees a number of pages which have been collected from the pcp lists.
21456 * Assumes all pages on list are in same zone, and of same order.
21457 * count is the number of pages to free.
21459 @@ -1083,19 +1096,58 @@
21460 * pinned" detection logic.
21462 static void free_pcppages_bulk(struct zone *zone, int count,
21463 - struct per_cpu_pages *pcp)
21464 + struct list_head *list)
21466 - int migratetype = 0;
21467 - int batch_free = 0;
21468 unsigned long nr_scanned;
21469 bool isolated_pageblocks;
21470 + unsigned long flags;
21472 + spin_lock_irqsave(&zone->lock, flags);
21474 - spin_lock(&zone->lock);
21475 isolated_pageblocks = has_isolate_pageblock(zone);
21476 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21478 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21480 + while (!list_empty(list)) {
21481 + struct page *page;
21482 + int mt; /* migratetype of the to-be-freed page */
21484 + page = list_first_entry(list, struct page, lru);
21485 + /* must delete as __free_one_page list manipulates */
21486 + list_del(&page->lru);
21488 + mt = get_pcppage_migratetype(page);
21489 + /* MIGRATE_ISOLATE page should not go to pcplists */
21490 + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21491 + /* Pageblock could have been isolated meanwhile */
21492 + if (unlikely(isolated_pageblocks))
21493 + mt = get_pageblock_migratetype(page);
21495 + if (bulkfree_pcp_prepare(page))
21496 + continue;
21498 + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21499 + trace_mm_page_pcpu_drain(page, 0, mt);
21502 + WARN_ON(count != 0);
21503 + spin_unlock_irqrestore(&zone->lock, flags);
21507 + * Moves a number of pages from the PCP lists to a private list,
21508 + * which is then freed outside of the locked region.
21510 + * Assumes all pages on list are in same zone, and of same order.
21511 + * count is the number of pages to free.
21513 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
21514 + struct list_head *dst)
21516 + int migratetype = 0;
21517 + int batch_free = 0;
21521 struct list_head *list;
21522 @@ -1111,7 +1163,7 @@
21524 if (++migratetype == MIGRATE_PCPTYPES)
21526 - list = &pcp->lists[migratetype];
21527 + list = &src->lists[migratetype];
21528 } while (list_empty(list));
21530 /* This is the only non-empty list. Free them all. */
21531 @@ -1119,27 +1171,12 @@
21532 batch_free = count;
21535 - int mt; /* migratetype of the to-be-freed page */
21537 page = list_last_entry(list, struct page, lru);
21538 - /* must delete as __free_one_page list manipulates */
21539 list_del(&page->lru);
21541 - mt = get_pcppage_migratetype(page);
21542 - /* MIGRATE_ISOLATE page should not go to pcplists */
21543 - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
21544 - /* Pageblock could have been isolated meanwhile */
21545 - if (unlikely(isolated_pageblocks))
21546 - mt = get_pageblock_migratetype(page);
21548 - if (bulkfree_pcp_prepare(page))
21551 - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
21552 - trace_mm_page_pcpu_drain(page, 0, mt);
21553 + list_add(&page->lru, dst);
21554 } while (--count && --batch_free && !list_empty(list));
21556 - spin_unlock(&zone->lock);
21559 static void free_one_page(struct zone *zone,
21560 @@ -1148,7 +1185,9 @@
21563 unsigned long nr_scanned;
21564 - spin_lock(&zone->lock);
21565 + unsigned long flags;
21567 + spin_lock_irqsave(&zone->lock, flags);
21568 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
21570 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
21571 @@ -1158,7 +1197,7 @@
21572 migratetype = get_pfnblock_migratetype(page, pfn);
21574 __free_one_page(page, pfn, zone, order, migratetype);
21575 - spin_unlock(&zone->lock);
21576 + spin_unlock_irqrestore(&zone->lock, flags);
21579 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
21580 @@ -1244,10 +1283,10 @@
21583 migratetype = get_pfnblock_migratetype(page, pfn);
21584 - local_irq_save(flags);
21585 + local_lock_irqsave(pa_lock, flags);
21586 __count_vm_events(PGFREE, 1 << order);
21587 free_one_page(page_zone(page), page, pfn, order, migratetype);
21588 - local_irq_restore(flags);
21589 + local_unlock_irqrestore(pa_lock, flags);
21592 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
21593 @@ -2246,16 +2285,18 @@
21594 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
21596 unsigned long flags;
21597 + LIST_HEAD(dst);
21598 int to_drain, batch;
21600 - local_irq_save(flags);
21601 + local_lock_irqsave(pa_lock, flags);
21602 batch = READ_ONCE(pcp->batch);
21603 to_drain = min(pcp->count, batch);
21604 if (to_drain > 0) {
21605 - free_pcppages_bulk(zone, to_drain, pcp);
21606 + isolate_pcp_pages(to_drain, pcp, &dst);
21607 pcp->count -= to_drain;
21609 - local_irq_restore(flags);
21610 + local_unlock_irqrestore(pa_lock, flags);
21611 + free_pcppages_bulk(zone, to_drain, &dst);
21615 @@ -2271,16 +2312,21 @@
21616 unsigned long flags;
21617 struct per_cpu_pageset *pset;
21618 struct per_cpu_pages *pcp;
21619 + LIST_HEAD(dst);
21620 + int count;
21622 - local_irq_save(flags);
21623 + cpu_lock_irqsave(cpu, flags);
21624 pset = per_cpu_ptr(zone->pageset, cpu);
21627 - if (pcp->count) {
21628 - free_pcppages_bulk(zone, pcp->count, pcp);
21629 + count = pcp->count;
21631 + isolate_pcp_pages(count, pcp, &dst);
21634 - local_irq_restore(flags);
21635 + cpu_unlock_irqrestore(cpu, flags);
21637 + free_pcppages_bulk(zone, count, &dst);
21641 @@ -2366,8 +2412,17 @@
21643 cpumask_clear_cpu(cpu, &cpus_with_pcps);
21645 +#ifndef CONFIG_PREEMPT_RT_BASE
21646 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
21647 zone, 1);
21648 +#else
21649 + for_each_cpu(cpu, &cpus_with_pcps) {
21650 + if (zone)
21651 + drain_pages_zone(cpu, zone);
21652 + else
21653 + drain_pages(cpu);
21654 + }
21655 +#endif
21658 #ifdef CONFIG_HIBERNATION
21659 @@ -2427,7 +2482,7 @@
21661 migratetype = get_pfnblock_migratetype(page, pfn);
21662 set_pcppage_migratetype(page, migratetype);
21663 - local_irq_save(flags);
21664 + local_lock_irqsave(pa_lock, flags);
21665 __count_vm_event(PGFREE);
21668 @@ -2453,12 +2508,17 @@
21670 if (pcp->count >= pcp->high) {
21671 unsigned long batch = READ_ONCE(pcp->batch);
21672 - free_pcppages_bulk(zone, batch, pcp);
21675 + isolate_pcp_pages(batch, pcp, &dst);
21676 pcp->count -= batch;
21677 + local_unlock_irqrestore(pa_lock, flags);
21678 + free_pcppages_bulk(zone, batch, &dst);
21683 - local_irq_restore(flags);
21684 + local_unlock_irqrestore(pa_lock, flags);
21688 @@ -2600,7 +2660,7 @@
21689 struct per_cpu_pages *pcp;
21690 struct list_head *list;
21692 - local_irq_save(flags);
21693 + local_lock_irqsave(pa_lock, flags);
21695 pcp = &this_cpu_ptr(zone->pageset)->pcp;
21696 list = &pcp->lists[migratetype];
21697 @@ -2627,7 +2687,7 @@
21698 * allocate greater than order-1 page units with __GFP_NOFAIL.
21700 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
21701 - spin_lock_irqsave(&zone->lock, flags);
21702 + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
21706 @@ -2639,22 +2699,24 @@
21708 page = __rmqueue(zone, order, migratetype);
21709 } while (page && check_new_pages(page, order));
21710 - spin_unlock(&zone->lock);
21711 - if (!page)
21712 + if (!page) {
21713 + spin_unlock(&zone->lock);
21714 goto failed;
21715 + }
21716 __mod_zone_freepage_state(zone, -(1 << order),
21717 get_pcppage_migratetype(page));
21718 + spin_unlock(&zone->lock);
21721 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
21722 zone_statistics(preferred_zone, zone, gfp_flags);
21723 - local_irq_restore(flags);
21724 + local_unlock_irqrestore(pa_lock, flags);
21726 VM_BUG_ON_PAGE(bad_range(zone, page), page);
21730 - local_irq_restore(flags);
21731 + local_unlock_irqrestore(pa_lock, flags);
21735 @@ -6505,7 +6567,9 @@
21736 int cpu = (unsigned long)hcpu;
21738 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
21739 + local_lock_irq_on(swapvec_lock, cpu);
21740 lru_add_drain_cpu(cpu);
21741 + local_unlock_irq_on(swapvec_lock, cpu);
21745 @@ -6531,6 +6595,7 @@
21746 void __init page_alloc_init(void)
21748 hotcpu_notifier(page_alloc_cpu_notify, 0);
21749 + local_irq_lock_init(pa_lock);
21753 @@ -7359,7 +7424,7 @@
21754 struct per_cpu_pageset *pset;
21756 /* avoid races with drain_pages() */
21757 - local_irq_save(flags);
21758 + local_lock_irqsave(pa_lock, flags);
21759 if (zone->pageset != &boot_pageset) {
21760 for_each_online_cpu(cpu) {
21761 pset = per_cpu_ptr(zone->pageset, cpu);
21762 @@ -7368,7 +7433,7 @@
21763 free_percpu(zone->pageset);
21764 zone->pageset = &boot_pageset;
21766 - local_irq_restore(flags);
21767 + local_unlock_irqrestore(pa_lock, flags);
21770 #ifdef CONFIG_MEMORY_HOTREMOVE
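The recurring shape in this file: isolate_pcp_pages() detaches pages from the per-CPU lists under pa_lock, then free_pcppages_bulk() takes zone->lock only for the actual freeing, so neither lock is held across the other's work and the irq-off windows stay short. The split in miniature (userspace sketch; a mutex stands in for pa_lock):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pcp_list;

static struct node *isolate_all(void)
{
	pthread_mutex_lock(&pcp_lock);
	struct node *batch = pcp_list;	/* detach: cheap, O(1) */
	pcp_list = NULL;
	pthread_mutex_unlock(&pcp_lock);
	return batch;
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->next = pcp_list;
		pcp_list = n;
	}

	int freed = 0;
	struct node *batch = isolate_all();
	while (batch) {			/* expensive part, lock dropped */
		struct node *n = batch;
		batch = batch->next;
		free(n);
		freed++;
	}
	printf("freed %d pages\n", freed);
	return 0;
}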
21771 diff -Nur linux-4.9.6.orig/mm/slab.h linux-4.9.6/mm/slab.h
21772 --- linux-4.9.6.orig/mm/slab.h 2017-01-26 08:25:24.000000000 +0100
21773 +++ linux-4.9.6/mm/slab.h 2017-01-28 13:59:10.099662644 +0100
21774 @@ -426,7 +426,11 @@
21775 * The slab lists for all objects.
21777 struct kmem_cache_node {
21778 +#ifdef CONFIG_SLUB
21779 + raw_spinlock_t list_lock;
21781 spinlock_t list_lock;
21785 struct list_head slabs_partial; /* partial list first, better asm code */
21786 diff -Nur linux-4.9.6.orig/mm/slub.c linux-4.9.6/mm/slub.c
21787 --- linux-4.9.6.orig/mm/slub.c 2017-01-26 08:25:24.000000000 +0100
21788 +++ linux-4.9.6/mm/slub.c 2017-01-28 13:59:10.099662644 +0100
21789 @@ -1141,7 +1141,7 @@
21790 unsigned long uninitialized_var(flags);
21793 - spin_lock_irqsave(&n->list_lock, flags);
21794 + raw_spin_lock_irqsave(&n->list_lock, flags);
21797 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
21798 @@ -1176,7 +1176,7 @@
21802 - spin_unlock_irqrestore(&n->list_lock, flags);
21803 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
21805 slab_fix(s, "Object at 0x%p not freed", object);
21807 @@ -1304,6 +1304,12 @@
21809 #endif /* CONFIG_SLUB_DEBUG */
21811 +struct slub_free_list {
21812 + raw_spinlock_t lock;
21813 + struct list_head list;
21814 +};
21815 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
21818 * Hooks for other subsystems that check memory allocations. In a typical
21819 * production configuration these hooks all should produce no code at all.
21820 @@ -1523,10 +1529,17 @@
21824 + bool enableirqs = false;
21826 flags &= gfp_allowed_mask;
21828 if (gfpflags_allow_blocking(flags))
21829 + enableirqs = true;
21830 +#ifdef CONFIG_PREEMPT_RT_FULL
21831 + if (system_state == SYSTEM_RUNNING)
21832 + enableirqs = true;
21833 +#endif
21834 + if (enableirqs)
21835 local_irq_enable();
21837 flags |= s->allocflags;
21838 @@ -1601,7 +1614,7 @@
21842 - if (gfpflags_allow_blocking(flags))
21843 + if (enableirqs)
21844 local_irq_disable();
21847 @@ -1660,6 +1673,16 @@
21848 __free_pages(page, order);
21851 +static void free_delayed(struct list_head *h)
21852 +{
21853 + while (!list_empty(h)) {
21854 + struct page *page = list_first_entry(h, struct page, lru);
21855 +
21856 + list_del(&page->lru);
21857 + __free_slab(page->slab_cache, page);
21858 + }
21859 +}
21861 #define need_reserve_slab_rcu \
21862 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
21864 @@ -1691,6 +1714,12 @@
21867 call_rcu(head, rcu_free_slab);
21868 + } else if (irqs_disabled()) {
21869 + struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
21871 + raw_spin_lock(&f->lock);
21872 + list_add(&page->lru, &f->list);
21873 + raw_spin_unlock(&f->lock);
21875 __free_slab(s, page);
21877 @@ -1798,7 +1827,7 @@
21878 if (!n || !n->nr_partial)
21881 - spin_lock(&n->list_lock);
21882 + raw_spin_lock(&n->list_lock);
21883 list_for_each_entry_safe(page, page2, &n->partial, lru) {
21886 @@ -1823,7 +1852,7 @@
21890 - spin_unlock(&n->list_lock);
21891 + raw_spin_unlock(&n->list_lock);
21895 @@ -2069,7 +2098,7 @@
21896 * that acquire_slab() will see a slab page that
21899 - spin_lock(&n->list_lock);
21900 + raw_spin_lock(&n->list_lock);
21904 @@ -2080,7 +2109,7 @@
21905 * slabs from diagnostic functions will not see
21906 * any frozen slabs.
21908 - spin_lock(&n->list_lock);
21909 + raw_spin_lock(&n->list_lock);
21913 @@ -2115,7 +2144,7 @@
21917 - spin_unlock(&n->list_lock);
21918 + raw_spin_unlock(&n->list_lock);
21921 stat(s, DEACTIVATE_EMPTY);
21922 @@ -2147,10 +2176,10 @@
21923 n2 = get_node(s, page_to_nid(page));
21926 - spin_unlock(&n->list_lock);
21927 + raw_spin_unlock(&n->list_lock);
21930 - spin_lock(&n->list_lock);
21931 + raw_spin_lock(&n->list_lock);
21935 @@ -2179,7 +2208,7 @@
21939 - spin_unlock(&n->list_lock);
21940 + raw_spin_unlock(&n->list_lock);
21942 while (discard_page) {
21943 page = discard_page;
21944 @@ -2218,14 +2247,21 @@
21945 pobjects = oldpage->pobjects;
21946 pages = oldpage->pages;
21947 if (drain && pobjects > s->cpu_partial) {
21948 + struct slub_free_list *f;
21949 unsigned long flags;
21950 + LIST_HEAD(tofree);
21952 * partial array is full. Move the existing
21953 * set to the per node partial list.
21955 local_irq_save(flags);
21956 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
21957 + f = this_cpu_ptr(&slub_free_list);
21958 + raw_spin_lock(&f->lock);
21959 + list_splice_init(&f->list, &tofree);
21960 + raw_spin_unlock(&f->lock);
21961 local_irq_restore(flags);
21962 + free_delayed(&tofree);
21966 @@ -2297,7 +2333,22 @@
21968 static void flush_all(struct kmem_cache *s)
21970 + LIST_HEAD(tofree);
21973 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
21974 + for_each_online_cpu(cpu) {
21975 + struct slub_free_list *f;
21977 + if (!has_cpu_slab(cpu, s))
21980 + f = &per_cpu(slub_free_list, cpu);
21981 + raw_spin_lock_irq(&f->lock);
21982 + list_splice_init(&f->list, &tofree);
21983 + raw_spin_unlock_irq(&f->lock);
21984 + free_delayed(&tofree);
21989 @@ -2352,10 +2403,10 @@
21990 unsigned long x = 0;
21993 - spin_lock_irqsave(&n->list_lock, flags);
21994 + raw_spin_lock_irqsave(&n->list_lock, flags);
21995 list_for_each_entry(page, &n->partial, lru)
21996 x += get_count(page);
21997 - spin_unlock_irqrestore(&n->list_lock, flags);
21998 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22001 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
22002 @@ -2493,8 +2544,10 @@
22003 * already disabled (which is the case for bulk allocation).
22005 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22006 - unsigned long addr, struct kmem_cache_cpu *c)
22007 + unsigned long addr, struct kmem_cache_cpu *c,
22008 + struct list_head *to_free)
22010 + struct slub_free_list *f;
22014 @@ -2554,6 +2607,13 @@
22015 VM_BUG_ON(!c->page->frozen);
22016 c->freelist = get_freepointer(s, freelist);
22017 c->tid = next_tid(c->tid);
22020 + f = this_cpu_ptr(&slub_free_list);
22021 + raw_spin_lock(&f->lock);
22022 + list_splice_init(&f->list, to_free);
22023 + raw_spin_unlock(&f->lock);
22028 @@ -2585,7 +2645,7 @@
22029 deactivate_slab(s, page, get_freepointer(s, freelist));
22031 c->freelist = NULL;
22037 @@ -2597,6 +2657,7 @@
22040 unsigned long flags;
22041 + LIST_HEAD(tofree);
22043 local_irq_save(flags);
22044 #ifdef CONFIG_PREEMPT
22045 @@ -2608,8 +2669,9 @@
22046 c = this_cpu_ptr(s->cpu_slab);
22049 - p = ___slab_alloc(s, gfpflags, node, addr, c);
22050 + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
22051 local_irq_restore(flags);
22052 + free_delayed(&tofree);
22056 @@ -2795,7 +2857,7 @@
22060 - spin_unlock_irqrestore(&n->list_lock, flags);
22061 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22064 prior = page->freelist;
22065 @@ -2827,7 +2889,7 @@
22066 * Otherwise the list_lock will synchronize with
22067 * other processors updating the list of slabs.
22069 - spin_lock_irqsave(&n->list_lock, flags);
22070 + raw_spin_lock_irqsave(&n->list_lock, flags);
22074 @@ -2869,7 +2931,7 @@
22075 add_partial(n, page, DEACTIVATE_TO_TAIL);
22076 stat(s, FREE_ADD_PARTIAL);
22078 - spin_unlock_irqrestore(&n->list_lock, flags);
22079 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22083 @@ -2884,7 +2946,7 @@
22084 remove_full(s, n, page);
22087 - spin_unlock_irqrestore(&n->list_lock, flags);
22088 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22089 stat(s, FREE_SLAB);
22090 discard_slab(s, page);
22092 @@ -3089,6 +3151,7 @@
22095 struct kmem_cache_cpu *c;
22096 + LIST_HEAD(to_free);
22099 /* memcg and kmem_cache debug support */
22100 @@ -3112,7 +3175,7 @@
22101 * of re-populating per CPU c->freelist
22103 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
22105 + _RET_IP_, c, &to_free);
22106 if (unlikely(!p[i]))
22109 @@ -3124,6 +3187,7 @@
22111 c->tid = next_tid(c->tid);
22112 local_irq_enable();
22113 + free_delayed(&to_free);
22115 /* Clear memory outside IRQ disabled fastpath loop */
22116 if (unlikely(flags & __GFP_ZERO)) {
22117 @@ -3271,7 +3335,7 @@
22118 init_kmem_cache_node(struct kmem_cache_node *n)
22121 - spin_lock_init(&n->list_lock);
22122 + raw_spin_lock_init(&n->list_lock);
22123 INIT_LIST_HEAD(&n->partial);
22124 #ifdef CONFIG_SLUB_DEBUG
22125 atomic_long_set(&n->nr_slabs, 0);
22126 @@ -3615,6 +3679,10 @@
22129 #ifdef CONFIG_SLUB_DEBUG
22130 +#ifdef CONFIG_PREEMPT_RT_BASE
22131 + /* XXX move out of irq-off section */
22132 + slab_err(s, page, text, s->name);
22134 void *addr = page_address(page);
22136 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
22137 @@ -3635,6 +3703,7 @@
22145 @@ -3648,7 +3717,7 @@
22146 struct page *page, *h;
22148 BUG_ON(irqs_disabled());
22149 - spin_lock_irq(&n->list_lock);
22150 + raw_spin_lock_irq(&n->list_lock);
22151 list_for_each_entry_safe(page, h, &n->partial, lru) {
22152 if (!page->inuse) {
22153 remove_partial(n, page);
22154 @@ -3658,7 +3727,7 @@
22155 "Objects remaining in %s on __kmem_cache_shutdown()");
22158 - spin_unlock_irq(&n->list_lock);
22159 + raw_spin_unlock_irq(&n->list_lock);
22161 list_for_each_entry_safe(page, h, &discard, lru)
22162 discard_slab(s, page);
22163 @@ -3916,7 +3985,7 @@
22164 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
22165 INIT_LIST_HEAD(promote + i);
22167 - spin_lock_irqsave(&n->list_lock, flags);
22168 + raw_spin_lock_irqsave(&n->list_lock, flags);
22171 * Build lists of slabs to discard or promote.
22172 @@ -3947,7 +4016,7 @@
22173 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
22174 list_splice(promote + i, &n->partial);
22176 - spin_unlock_irqrestore(&n->list_lock, flags);
22177 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22179 /* Release empty slabs */
22180 list_for_each_entry_safe(page, t, &discard, lru)
22181 @@ -4123,6 +4192,12 @@
22183 static __initdata struct kmem_cache boot_kmem_cache,
22184 boot_kmem_cache_node;
22187 + for_each_possible_cpu(cpu) {
22188 + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
22189 + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
22192 if (debug_guardpage_minorder())
22193 slub_max_order = 0;
22194 @@ -4331,7 +4406,7 @@
22196 unsigned long flags;
22198 - spin_lock_irqsave(&n->list_lock, flags);
22199 + raw_spin_lock_irqsave(&n->list_lock, flags);
22201 list_for_each_entry(page, &n->partial, lru) {
22202 validate_slab_slab(s, page, map);
22203 @@ -4353,7 +4428,7 @@
22204 s->name, count, atomic_long_read(&n->nr_slabs));
22207 - spin_unlock_irqrestore(&n->list_lock, flags);
22208 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22212 @@ -4541,12 +4616,12 @@
22213 if (!atomic_long_read(&n->nr_slabs))
22216 - spin_lock_irqsave(&n->list_lock, flags);
22217 + raw_spin_lock_irqsave(&n->list_lock, flags);
22218 list_for_each_entry(page, &n->partial, lru)
22219 process_slab(&t, s, page, alloc, map);
22220 list_for_each_entry(page, &n->full, lru)
22221 process_slab(&t, s, page, alloc, map);
22222 - spin_unlock_irqrestore(&n->list_lock, flags);
22223 + raw_spin_unlock_irqrestore(&n->list_lock, flags);
22226 for (i = 0; i < t.count; i++) {
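A recurring piece of the slub.c changes above is the per-CPU slub_free_list: with the node list_lock now raw, __free_slab() (which can end up in page-allocator paths that take sleeping locks on RT) must not run while interrupts are off, so pages are parked on the per-CPU list and released later through free_delayed() once IRQs are enabled again. The shape of that pattern in plain, runnable C, with invented names:

    /* Sketch of the park-now/free-later pattern; ordinary malloc()/free()
     * stand in for slab pages, and the list link lives inside the object
     * itself, as the kernel uses page->lru. Build: gcc -o defer defer.c */
    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        struct obj *next;               /* link embedded in the object */
        char payload[56];
    };

    static struct obj *deferred_head;   /* stands in for slub_free_list */
    static int unsafe_context = 1;      /* stands in for irqs_disabled() */

    static void release(struct obj *o)
    {
        if (unsafe_context) {
            o->next = deferred_head;    /* park it; no allocation needed */
            deferred_head = o;
            return;
        }
        free(o);
    }

    static void free_delayed(void)
    {
        while (deferred_head) {
            struct obj *o = deferred_head;

            deferred_head = o->next;
            free(o);
        }
    }

    int main(void)
    {
        release(malloc(sizeof(struct obj)));
        release(malloc(sizeof(struct obj)));
        unsafe_context = 0;             /* back in a safe context... */
        free_delayed();                 /* ...flush what was parked */
        puts("deferred frees flushed");
        return 0;
    }

Because the link is embedded in the object, parking costs no allocation; the only requirement is that every path that re-enables interrupts also drains the list, which is why the hunks above splice and flush it in ___slab_alloc(), flush_all() and the bulk-allocation path alike.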
22227 diff -Nur linux-4.9.6.orig/mm/swap.c linux-4.9.6/mm/swap.c
22228 --- linux-4.9.6.orig/mm/swap.c 2017-01-26 08:25:24.000000000 +0100
22229 +++ linux-4.9.6/mm/swap.c 2017-01-28 13:59:10.103662798 +0100
22231 #include <linux/memcontrol.h>
22232 #include <linux/gfp.h>
22233 #include <linux/uio.h>
22234 +#include <linux/locallock.h>
22235 #include <linux/hugetlb.h>
22236 #include <linux/page_idle.h>
22240 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
22242 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
22243 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
22246 * This path almost never happens for VM activity - pages are normally
22247 @@ -240,11 +243,11 @@
22248 unsigned long flags;
22251 - local_irq_save(flags);
22252 + local_lock_irqsave(rotate_lock, flags);
22253 pvec = this_cpu_ptr(&lru_rotate_pvecs);
22254 if (!pagevec_add(pvec, page) || PageCompound(page))
22255 pagevec_move_tail(pvec);
22256 - local_irq_restore(flags);
22257 + local_unlock_irqrestore(rotate_lock, flags);
22261 @@ -294,12 +297,13 @@
22263 page = compound_head(page);
22264 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
22265 - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
22266 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22267 + activate_page_pvecs);
22270 if (!pagevec_add(pvec, page) || PageCompound(page))
22271 pagevec_lru_move_fn(pvec, __activate_page, NULL);
22272 - put_cpu_var(activate_page_pvecs);
22273 + put_locked_var(swapvec_lock, activate_page_pvecs);
22277 @@ -326,7 +330,7 @@
22279 static void __lru_cache_activate_page(struct page *page)
22281 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22282 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22286 @@ -348,7 +352,7 @@
22290 - put_cpu_var(lru_add_pvec);
22291 + put_locked_var(swapvec_lock, lru_add_pvec);
22295 @@ -390,12 +394,12 @@
22297 static void __lru_cache_add(struct page *page)
22299 - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
22300 + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
22303 if (!pagevec_add(pvec, page) || PageCompound(page))
22304 __pagevec_lru_add(pvec);
22305 - put_cpu_var(lru_add_pvec);
22306 + put_locked_var(swapvec_lock, lru_add_pvec);
22310 @@ -593,9 +597,15 @@
22311 unsigned long flags;
22313 /* No harm done if a racing interrupt already did this */
22314 - local_irq_save(flags);
22315 +#ifdef CONFIG_PREEMPT_RT_BASE
22316 + local_lock_irqsave_on(rotate_lock, flags, cpu);
22317 pagevec_move_tail(pvec);
22318 - local_irq_restore(flags);
22319 + local_unlock_irqrestore_on(rotate_lock, flags, cpu);
22321 + local_lock_irqsave(rotate_lock, flags);
22322 + pagevec_move_tail(pvec);
22323 + local_unlock_irqrestore(rotate_lock, flags);
22327 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
22328 @@ -627,11 +637,12 @@
22331 if (likely(get_page_unless_zero(page))) {
22332 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
22333 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22334 + lru_deactivate_file_pvecs);
22336 if (!pagevec_add(pvec, page) || PageCompound(page))
22337 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
22338 - put_cpu_var(lru_deactivate_file_pvecs);
22339 + put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
22343 @@ -646,27 +657,31 @@
22344 void deactivate_page(struct page *page)
22346 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
22347 - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
22348 + struct pagevec *pvec = &get_locked_var(swapvec_lock,
22349 + lru_deactivate_pvecs);
22352 if (!pagevec_add(pvec, page) || PageCompound(page))
22353 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
22354 - put_cpu_var(lru_deactivate_pvecs);
22355 + put_locked_var(swapvec_lock, lru_deactivate_pvecs);
22359 void lru_add_drain(void)
22361 - lru_add_drain_cpu(get_cpu());
22363 + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
22364 + local_unlock_cpu(swapvec_lock);
22367 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
22368 +#ifdef CONFIG_PREEMPT_RT_BASE
22369 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22372 + local_lock_on(swapvec_lock, cpu);
22373 + lru_add_drain_cpu(cpu);
22374 + local_unlock_on(swapvec_lock, cpu);
22377 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22381 * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
22382 @@ -686,6 +701,22 @@
22384 early_initcall(lru_init);
22386 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
22391 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
22392 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
22394 + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22396 + INIT_WORK(work, lru_add_drain_per_cpu);
22397 + queue_work_on(cpu, lru_add_drain_wq, work);
22398 + cpumask_set_cpu(cpu, has_work);
22402 void lru_add_drain_all(void)
22404 static DEFINE_MUTEX(lock);
22405 @@ -697,21 +728,18 @@
22406 cpumask_clear(&has_work);
22408 for_each_online_cpu(cpu) {
22409 - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
22411 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
22412 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
22413 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
22414 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
22415 - need_activate_page_drain(cpu)) {
22416 - INIT_WORK(work, lru_add_drain_per_cpu);
22417 - queue_work_on(cpu, lru_add_drain_wq, work);
22418 - cpumask_set_cpu(cpu, &has_work);
22420 + need_activate_page_drain(cpu))
22421 + remote_lru_add_drain(cpu, &has_work);
22424 +#ifndef CONFIG_PREEMPT_RT_BASE
22425 for_each_cpu(cpu, &has_work)
22426 flush_work(&per_cpu(lru_add_drain_work, cpu));
22430 mutex_unlock(&lock);
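The lru_add_drain_all() rework above shows what the per-CPU swapvec_lock buys: mainline must queue a work item on every CPU and flush it, because only code running on a CPU may touch that CPU's pagevecs, whereas the RT variant of remote_lru_add_drain() simply takes the remote CPU's lock and drains the pagevecs from the calling context. A toy version of the RT strategy (invented names; pthread mutexes stand in for local locks):

    /* Toy version of the RT drain strategy: with a lock per slot, one
     * caller can drain every slot directly.
     * Build: gcc -pthread -o drain drain.c */
    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    struct pagevec {
        pthread_mutex_t lock;           /* stands in for swapvec_lock */
        int nr;                         /* pages queued on this "CPU" */
    };

    static struct pagevec lru_add_pvec[NR_CPUS];

    /* stands in for the RT remote_lru_add_drain() above */
    static void remote_drain(int cpu)
    {
        pthread_mutex_lock(&lru_add_pvec[cpu].lock);
        lru_add_pvec[cpu].nr = 0;   /* the real code moves pages to the LRU */
        pthread_mutex_unlock(&lru_add_pvec[cpu].lock);
    }

    int main(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            pthread_mutex_init(&lru_add_pvec[cpu].lock, NULL);
            lru_add_pvec[cpu].nr = cpu; /* pretend pages are queued */
        }
        /* no per-CPU work items, no flush_work(): just take each lock */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            remote_drain(cpu);
        puts("all pagevecs drained from one thread");
        return 0;
    }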
22431 diff -Nur linux-4.9.6.orig/mm/truncate.c linux-4.9.6/mm/truncate.c
22432 --- linux-4.9.6.orig/mm/truncate.c 2017-01-26 08:25:24.000000000 +0100
22433 +++ linux-4.9.6/mm/truncate.c 2017-01-28 13:59:10.103662798 +0100
22435 * protected by mapping->tree_lock.
22437 if (!workingset_node_shadows(node) &&
22438 - !list_empty(&node->private_list))
22439 - list_lru_del(&workingset_shadow_nodes,
22440 + !list_empty(&node->private_list)) {
22441 + local_lock(workingset_shadow_lock);
22442 + list_lru_del(&__workingset_shadow_nodes,
22443 &node->private_list);
22444 + local_unlock(workingset_shadow_lock);
22446 __radix_tree_delete_node(&mapping->page_tree, node);
22448 spin_unlock_irq(&mapping->tree_lock);
22449 diff -Nur linux-4.9.6.orig/mm/vmalloc.c linux-4.9.6/mm/vmalloc.c
22450 --- linux-4.9.6.orig/mm/vmalloc.c 2017-01-26 08:25:24.000000000 +0100
22451 +++ linux-4.9.6/mm/vmalloc.c 2017-01-28 13:59:10.103662798 +0100
22452 @@ -845,7 +845,7 @@
22453 struct vmap_block *vb;
22454 struct vmap_area *va;
22455 unsigned long vb_idx;
22457 + int node, err, cpu;
22460 node = numa_node_id();
22461 @@ -888,11 +888,12 @@
22463 radix_tree_preload_end();
22465 - vbq = &get_cpu_var(vmap_block_queue);
22466 + cpu = get_cpu_light();
22467 + vbq = this_cpu_ptr(&vmap_block_queue);
22468 spin_lock(&vbq->lock);
22469 list_add_tail_rcu(&vb->free_list, &vbq->free);
22470 spin_unlock(&vbq->lock);
22471 - put_cpu_var(vmap_block_queue);
22476 @@ -961,6 +962,7 @@
22477 struct vmap_block *vb;
22478 void *vaddr = NULL;
22479 unsigned int order;
22482 BUG_ON(offset_in_page(size));
22483 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
22484 @@ -975,7 +977,8 @@
22485 order = get_order(size);
22488 - vbq = &get_cpu_var(vmap_block_queue);
22489 + cpu = get_cpu_light();
22490 + vbq = this_cpu_ptr(&vmap_block_queue);
22491 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
22492 unsigned long pages_off;
22494 @@ -998,7 +1001,7 @@
22498 - put_cpu_var(vmap_block_queue);
22502 /* Allocate new block if nothing was found */
22503 diff -Nur linux-4.9.6.orig/mm/vmstat.c linux-4.9.6/mm/vmstat.c
22504 --- linux-4.9.6.orig/mm/vmstat.c 2017-01-26 08:25:24.000000000 +0100
22505 +++ linux-4.9.6/mm/vmstat.c 2017-01-28 13:59:10.103662798 +0100
22506 @@ -245,6 +245,7 @@
22510 + preempt_disable_rt();
22511 x = delta + __this_cpu_read(*p);
22513 t = __this_cpu_read(pcp->stat_threshold);
22514 @@ -254,6 +255,7 @@
22517 __this_cpu_write(*p, x);
22518 + preempt_enable_rt();
22520 EXPORT_SYMBOL(__mod_zone_page_state);
22522 @@ -265,6 +267,7 @@
22526 + preempt_disable_rt();
22527 x = delta + __this_cpu_read(*p);
22529 t = __this_cpu_read(pcp->stat_threshold);
22530 @@ -274,6 +277,7 @@
22533 __this_cpu_write(*p, x);
22534 + preempt_enable_rt();
22536 EXPORT_SYMBOL(__mod_node_page_state);
22538 @@ -306,6 +310,7 @@
22539 s8 __percpu *p = pcp->vm_stat_diff + item;
22542 + preempt_disable_rt();
22543 v = __this_cpu_inc_return(*p);
22544 t = __this_cpu_read(pcp->stat_threshold);
22545 if (unlikely(v > t)) {
22546 @@ -314,6 +319,7 @@
22547 zone_page_state_add(v + overstep, zone, item);
22548 __this_cpu_write(*p, -overstep);
22550 + preempt_enable_rt();
22553 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22554 @@ -322,6 +328,7 @@
22555 s8 __percpu *p = pcp->vm_node_stat_diff + item;
22558 + preempt_disable_rt();
22559 v = __this_cpu_inc_return(*p);
22560 t = __this_cpu_read(pcp->stat_threshold);
22561 if (unlikely(v > t)) {
22562 @@ -330,6 +337,7 @@
22563 node_page_state_add(v + overstep, pgdat, item);
22564 __this_cpu_write(*p, -overstep);
22566 + preempt_enable_rt();
22569 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
22570 @@ -350,6 +358,7 @@
22571 s8 __percpu *p = pcp->vm_stat_diff + item;
22574 + preempt_disable_rt();
22575 v = __this_cpu_dec_return(*p);
22576 t = __this_cpu_read(pcp->stat_threshold);
22577 if (unlikely(v < - t)) {
22578 @@ -358,6 +367,7 @@
22579 zone_page_state_add(v - overstep, zone, item);
22580 __this_cpu_write(*p, overstep);
22582 + preempt_enable_rt();
22585 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
22586 @@ -366,6 +376,7 @@
22587 s8 __percpu *p = pcp->vm_node_stat_diff + item;
22590 + preempt_disable_rt();
22591 v = __this_cpu_dec_return(*p);
22592 t = __this_cpu_read(pcp->stat_threshold);
22593 if (unlikely(v < - t)) {
22594 @@ -374,6 +385,7 @@
22595 node_page_state_add(v - overstep, pgdat, item);
22596 __this_cpu_write(*p, overstep);
22598 + preempt_enable_rt();
22601 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
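The vmstat hunks bracket each counter update with preempt_disable_rt()/preempt_enable_rt(). These are no-ops on mainline, where the callers already run with preemption or interrupts off; on RT the callers may be preemptible, and the __this_cpu read-modify-write sequence then loses updates when two contexts interleave on the same counter. The lost-update mechanism itself is plain C, reproduced here with two racing threads:

    /* Deliberately racy demo: two threads each do one million unprotected
     * read-modify-write updates; the total usually comes up short.
     * Build: gcc -pthread -O2 -o race race.c */
    #include <pthread.h>
    #include <stdio.h>

    static volatile long counter;

    static void *bump(void *arg)
    {
        (void)arg;
        for (int i = 0; i < 1000000; i++)
            counter = counter + 1;      /* load, add, store: not atomic */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, bump, NULL);
        pthread_create(&b, NULL, bump, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("expected 2000000, got %ld\n", counter);
        return 0;
    }

The volatile is only there to force a real load and store per iteration; the point is the unprotected read-modify-write, which is exactly the window the preempt_disable_rt() pair closes for the vmstat counters.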
22602 diff -Nur linux-4.9.6.orig/mm/workingset.c linux-4.9.6/mm/workingset.c
22603 --- linux-4.9.6.orig/mm/workingset.c 2017-01-26 08:25:24.000000000 +0100
22604 +++ linux-4.9.6/mm/workingset.c 2017-01-28 13:59:10.103662798 +0100
22605 @@ -334,7 +334,8 @@
22606 * point where they would still be useful.
22609 -struct list_lru workingset_shadow_nodes;
22610 +struct list_lru __workingset_shadow_nodes;
22611 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
22613 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
22614 struct shrink_control *sc)
22615 @@ -344,9 +345,9 @@
22616 unsigned long pages;
22618 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
22619 - local_irq_disable();
22620 - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
22621 - local_irq_enable();
22622 + local_lock_irq(workingset_shadow_lock);
22623 + shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
22624 + local_unlock_irq(workingset_shadow_lock);
22627 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
22628 @@ -438,9 +439,9 @@
22629 spin_unlock(&mapping->tree_lock);
22630 ret = LRU_REMOVED_RETRY;
22632 - local_irq_enable();
22633 + local_unlock_irq(workingset_shadow_lock);
22635 - local_irq_disable();
22636 + local_lock_irq(workingset_shadow_lock);
22637 spin_lock(lru_lock);
22640 @@ -451,10 +452,10 @@
22643 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
22644 - local_irq_disable();
22645 - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
22646 + local_lock_irq(workingset_shadow_lock);
22647 + ret = list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
22648 shadow_lru_isolate, NULL);
22649 - local_irq_enable();
22650 + local_unlock_irq(workingset_shadow_lock);
22654 @@ -492,7 +493,7 @@
22655 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
22656 timestamp_bits, max_order, bucket_order);
22658 - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
22659 + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
22662 ret = register_shrinker(&workingset_shadow_shrinker);
22663 @@ -500,7 +501,7 @@
22667 - list_lru_destroy(&workingset_shadow_nodes);
22668 + list_lru_destroy(&__workingset_shadow_nodes);
22672 diff -Nur linux-4.9.6.orig/mm/zsmalloc.c linux-4.9.6/mm/zsmalloc.c
22673 --- linux-4.9.6.orig/mm/zsmalloc.c 2017-01-26 08:25:24.000000000 +0100
22674 +++ linux-4.9.6/mm/zsmalloc.c 2017-01-28 13:59:10.103662798 +0100
22676 #include <linux/mount.h>
22677 #include <linux/migrate.h>
22678 #include <linux/pagemap.h>
22679 +#include <linux/locallock.h>
22681 #define ZSPAGE_MAGIC 0x58
22685 #define ZS_MAX_ZSPAGE_ORDER 2
22686 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
22688 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
22690 +#ifdef CONFIG_PREEMPT_RT_FULL
22692 +struct zsmalloc_handle {
22693 + unsigned long addr;
22694 + struct mutex lock;
22697 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
22701 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
22705 * Object location (<PFN>, <obj_idx>) is encoded as
22706 * a single (unsigned long) handle value.
22707 @@ -327,7 +341,7 @@
22709 static int create_cache(struct zs_pool *pool)
22711 - pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
22712 + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
22714 if (!pool->handle_cachep)
22716 @@ -351,10 +365,27 @@
22718 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
22720 - return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
22721 - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
22724 + p = kmem_cache_alloc(pool->handle_cachep,
22725 + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
22726 +#ifdef CONFIG_PREEMPT_RT_FULL
22728 + struct zsmalloc_handle *zh = p;
22730 + mutex_init(&zh->lock);
22733 + return (unsigned long)p;
22736 +#ifdef CONFIG_PREEMPT_RT_FULL
22737 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
22739 + return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1));
22743 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
22745 kmem_cache_free(pool->handle_cachep, (void *)handle);
22746 @@ -373,12 +404,18 @@
22748 static void record_obj(unsigned long handle, unsigned long obj)
22750 +#ifdef CONFIG_PREEMPT_RT_FULL
22751 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22753 + WRITE_ONCE(zh->addr, obj);
22756 * lsb of @obj represents handle lock while other bits
22757 * represent object value the handle is pointing to, so
22758 * updating shouldn't do store tearing.
22760 WRITE_ONCE(*(unsigned long *)handle, obj);
22765 @@ -467,6 +504,7 @@
22767 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
22768 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
22769 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
22771 static bool is_zspage_isolated(struct zspage *zspage)
22773 @@ -902,7 +940,13 @@
22775 static unsigned long handle_to_obj(unsigned long handle)
22777 +#ifdef CONFIG_PREEMPT_RT_FULL
22778 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22782 return *(unsigned long *)handle;
22786 static unsigned long obj_to_head(struct page *page, void *obj)
22787 @@ -916,22 +960,46 @@
22789 static inline int testpin_tag(unsigned long handle)
22791 +#ifdef CONFIG_PREEMPT_RT_FULL
22792 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22794 + return mutex_is_locked(&zh->lock);
22796 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
22800 static inline int trypin_tag(unsigned long handle)
22802 +#ifdef CONFIG_PREEMPT_RT_FULL
22803 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22805 + return mutex_trylock(&zh->lock);
22807 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
22811 static void pin_tag(unsigned long handle)
22813 +#ifdef CONFIG_PREEMPT_RT_FULL
22814 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22816 + return mutex_lock(&zh->lock);
22818 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
22822 static void unpin_tag(unsigned long handle)
22824 +#ifdef CONFIG_PREEMPT_RT_FULL
22825 + struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
22827 + return mutex_unlock(&zh->lock);
22829 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
22833 static void reset_page(struct page *page)
22834 @@ -1423,7 +1491,7 @@
22835 class = pool->size_class[class_idx];
22836 off = (class->size * obj_idx) & ~PAGE_MASK;
22838 - area = &get_cpu_var(zs_map_area);
22839 + area = &get_locked_var(zs_map_area_lock, zs_map_area);
22841 if (off + class->size <= PAGE_SIZE) {
22842 /* this object is contained entirely within a page */
22843 @@ -1477,7 +1545,7 @@
22845 __zs_unmap_object(area, pages, off, class->size);
22847 - put_cpu_var(zs_map_area);
22848 + put_locked_var(zs_map_area_lock, zs_map_area);
22850 migrate_read_unlock(zspage);
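zsmalloc normally pins a handle with bit_spin_lock() on HANDLE_PIN_BIT, the low bit of the handle word. A bit spinlock has no owner field and no way to sleep, so on RT, where the lock holder itself may be preempted, waiters can spin indefinitely with no priority inheritance to help them; the hunks above therefore grow the handle into a struct carrying a real mutex. For reference, a toy bit spinlock in C11 atomics (not the kernel implementation) shows how little state there is to work with:

    /* Toy bit spinlock on bit 0 of a word, in the spirit of
     * HANDLE_PIN_BIT. Build: gcc -o bitlock bitlock.c */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static void bit_lock(_Atomic uintptr_t *word)
    {
        /* set bit 0; if it was already set, someone holds the lock */
        while (atomic_fetch_or(word, 1) & 1)
            ;   /* spin: a preempted owner leaves every waiter burning CPU */
    }

    static void bit_unlock(_Atomic uintptr_t *word)
    {
        atomic_fetch_and(word, ~(uintptr_t)1);
    }

    int main(void)
    {
        _Atomic uintptr_t handle = 0x1000;  /* an aligned "pointer" */

        bit_lock(&handle);
        printf("pinned:   %#lx\n", (unsigned long)handle);
        bit_unlock(&handle);
        printf("unpinned: %#lx\n", (unsigned long)handle);
        return 0;
    }

With only one bit of lock state there is nothing to attach an owner or a waiter list to, which is why the RT fix changes the data structure (handle becomes struct zsmalloc_handle) rather than just the lock calls.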
22852 diff -Nur linux-4.9.6.orig/net/core/dev.c linux-4.9.6/net/core/dev.c
22853 --- linux-4.9.6.orig/net/core/dev.c 2017-01-26 08:25:24.000000000 +0100
22854 +++ linux-4.9.6/net/core/dev.c 2017-01-28 13:59:10.103662798 +0100
22855 @@ -190,6 +190,7 @@
22856 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
22858 static seqcount_t devnet_rename_seq;
22859 +static DEFINE_MUTEX(devnet_rename_mutex);
22861 static inline void dev_base_seq_inc(struct net *net)
22863 @@ -211,14 +212,14 @@
22864 static inline void rps_lock(struct softnet_data *sd)
22867 - spin_lock(&sd->input_pkt_queue.lock);
22868 + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
22872 static inline void rps_unlock(struct softnet_data *sd)
22875 - spin_unlock(&sd->input_pkt_queue.lock);
22876 + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
22880 @@ -888,7 +889,8 @@
22881 strcpy(name, dev->name);
22883 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
22885 + mutex_lock(&devnet_rename_mutex);
22886 + mutex_unlock(&devnet_rename_mutex);
22890 @@ -1157,20 +1159,17 @@
22891 if (dev->flags & IFF_UP)
22894 - write_seqcount_begin(&devnet_rename_seq);
22895 + mutex_lock(&devnet_rename_mutex);
22896 + __raw_write_seqcount_begin(&devnet_rename_seq);
22898 - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
22899 - write_seqcount_end(&devnet_rename_seq);
22902 + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
22905 memcpy(oldname, dev->name, IFNAMSIZ);
22907 err = dev_get_valid_name(net, dev, newname);
22909 - write_seqcount_end(&devnet_rename_seq);
22915 if (oldname[0] && !strchr(oldname, '%'))
22916 netdev_info(dev, "renamed from %s\n", oldname);
22917 @@ -1183,11 +1182,12 @@
22919 memcpy(dev->name, oldname, IFNAMSIZ);
22920 dev->name_assign_type = old_assign_type;
22921 - write_seqcount_end(&devnet_rename_seq);
22927 - write_seqcount_end(&devnet_rename_seq);
22928 + __raw_write_seqcount_end(&devnet_rename_seq);
22929 + mutex_unlock(&devnet_rename_mutex);
22931 netdev_adjacent_rename_links(dev, oldname);
22933 @@ -1208,7 +1208,8 @@
22934 /* err >= 0 after dev_alloc_name() or stores the first errno */
22937 - write_seqcount_begin(&devnet_rename_seq);
22938 + mutex_lock(&devnet_rename_mutex);
22939 + __raw_write_seqcount_begin(&devnet_rename_seq);
22940 memcpy(dev->name, oldname, IFNAMSIZ);
22941 memcpy(oldname, newname, IFNAMSIZ);
22942 dev->name_assign_type = old_assign_type;
22943 @@ -1221,6 +1222,11 @@
22949 + __raw_write_seqcount_end(&devnet_rename_seq);
22950 + mutex_unlock(&devnet_rename_mutex);
22955 @@ -2263,6 +2269,7 @@
22956 sd->output_queue_tailp = &q->next_sched;
22957 raise_softirq_irqoff(NET_TX_SOFTIRQ);
22958 local_irq_restore(flags);
22959 + preempt_check_resched_rt();
22962 void __netif_schedule(struct Qdisc *q)
22963 @@ -2344,6 +2351,7 @@
22964 __this_cpu_write(softnet_data.completion_queue, skb);
22965 raise_softirq_irqoff(NET_TX_SOFTIRQ);
22966 local_irq_restore(flags);
22967 + preempt_check_resched_rt();
22969 EXPORT_SYMBOL(__dev_kfree_skb_irq);
22971 @@ -3078,7 +3086,11 @@
22972 * This permits qdisc->running owner to get the lock more
22973 * often and dequeue packets faster.
22975 +#ifdef CONFIG_PREEMPT_RT_FULL
22976 + contended = true;
22978 contended = qdisc_is_running(q);
22980 if (unlikely(contended))
22981 spin_lock(&q->busylock);
22983 @@ -3141,8 +3153,10 @@
22984 #define skb_update_prio(skb)
22987 +#ifndef CONFIG_PREEMPT_RT_FULL
22988 DEFINE_PER_CPU(int, xmit_recursion);
22989 EXPORT_SYMBOL(xmit_recursion);
22993 * dev_loopback_xmit - loop back @skb
22994 @@ -3376,8 +3390,7 @@
22995 int cpu = smp_processor_id(); /* ok because BHs are off */
22997 if (txq->xmit_lock_owner != cpu) {
22998 - if (unlikely(__this_cpu_read(xmit_recursion) >
22999 - XMIT_RECURSION_LIMIT))
23000 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
23001 goto recursion_alert;
23003 skb = validate_xmit_skb(skb, dev);
23004 @@ -3387,9 +3400,9 @@
23005 HARD_TX_LOCK(dev, txq, cpu);
23007 if (!netif_xmit_stopped(txq)) {
23008 - __this_cpu_inc(xmit_recursion);
23010 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
23011 - __this_cpu_dec(xmit_recursion);
23013 if (dev_xmit_complete(rc)) {
23014 HARD_TX_UNLOCK(dev, txq);
23016 @@ -3763,6 +3776,7 @@
23019 local_irq_restore(flags);
23020 + preempt_check_resched_rt();
23022 atomic_long_inc(&skb->dev->rx_dropped);
23024 @@ -3781,7 +3795,7 @@
23025 struct rps_dev_flow voidflow, *rflow = &voidflow;
23028 - preempt_disable();
23029 + migrate_disable();
23032 cpu = get_rps_cpu(skb->dev, skb, &rflow);
23033 @@ -3791,13 +3805,13 @@
23034 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
23037 - preempt_enable();
23038 + migrate_enable();
23042 unsigned int qtail;
23043 - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
23045 + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
23050 @@ -3831,11 +3845,9 @@
23052 trace_netif_rx_ni_entry(skb);
23054 - preempt_disable();
23055 + local_bh_disable();
23056 err = netif_rx_internal(skb);
23057 - if (local_softirq_pending())
23059 - preempt_enable();
23060 + local_bh_enable();
23064 @@ -4314,7 +4326,7 @@
23065 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
23066 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23067 __skb_unlink(skb, &sd->input_pkt_queue);
23069 + __skb_queue_tail(&sd->tofree_queue, skb);
23070 input_queue_head_incr(sd);
23073 @@ -4324,11 +4336,14 @@
23074 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
23075 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23076 __skb_unlink(skb, &sd->process_queue);
23078 + __skb_queue_tail(&sd->tofree_queue, skb);
23079 input_queue_head_incr(sd);
23082 + if (!skb_queue_empty(&sd->tofree_queue))
23083 + raise_softirq_irqoff(NET_RX_SOFTIRQ);
23088 static void flush_all_backlogs(void)
23089 @@ -4809,6 +4824,7 @@
23090 sd->rps_ipi_list = NULL;
23092 local_irq_enable();
23093 + preempt_check_resched_rt();
23095 /* Send pending IPI's to kick RPS processing on remote cpus. */
23097 @@ -4822,6 +4838,7 @@
23100 local_irq_enable();
23101 + preempt_check_resched_rt();
23104 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
23105 @@ -4851,7 +4868,9 @@
23107 struct sk_buff *skb;
23109 + local_irq_disable();
23110 while ((skb = __skb_dequeue(&sd->process_queue))) {
23111 + local_irq_enable();
23113 __netif_receive_skb(skb);
23115 @@ -4859,9 +4878,9 @@
23116 if (++work >= quota)
23119 + local_irq_disable();
23122 - local_irq_disable();
23124 if (skb_queue_empty(&sd->input_pkt_queue)) {
23126 @@ -4899,9 +4918,11 @@
23127 local_irq_save(flags);
23128 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23129 local_irq_restore(flags);
23130 + preempt_check_resched_rt();
23132 EXPORT_SYMBOL(__napi_schedule);
23134 +#ifndef CONFIG_PREEMPT_RT_FULL
23136 * __napi_schedule_irqoff - schedule for receive
23137 * @n: entry to schedule
23138 @@ -4913,6 +4934,7 @@
23139 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
23141 EXPORT_SYMBOL(__napi_schedule_irqoff);
23144 void __napi_complete(struct napi_struct *n)
23146 @@ -5202,13 +5224,21 @@
23147 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
23148 unsigned long time_limit = jiffies + 2;
23149 int budget = netdev_budget;
23150 + struct sk_buff_head tofree_q;
23151 + struct sk_buff *skb;
23155 + __skb_queue_head_init(&tofree_q);
23157 local_irq_disable();
23158 + skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
23159 list_splice_init(&sd->poll_list, &list);
23160 local_irq_enable();
23162 + while ((skb = __skb_dequeue(&tofree_q)))
23166 struct napi_struct *n;
23168 @@ -5239,7 +5269,7 @@
23169 list_splice_tail(&repoll, &list);
23170 list_splice(&list, &sd->poll_list);
23171 if (!list_empty(&sd->poll_list))
23172 - __raise_softirq_irqoff(NET_RX_SOFTIRQ);
23173 + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
23175 net_rps_action_and_irq_enable(sd);
23177 @@ -8000,16 +8030,20 @@
23179 raise_softirq_irqoff(NET_TX_SOFTIRQ);
23180 local_irq_enable();
23181 + preempt_check_resched_rt();
23183 /* Process offline CPU's input_pkt_queue */
23184 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
23186 input_queue_head_incr(oldsd);
23188 - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
23189 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
23191 input_queue_head_incr(oldsd);
23193 + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
23199 @@ -8314,8 +8348,9 @@
23201 INIT_WORK(flush, flush_backlog);
23203 - skb_queue_head_init(&sd->input_pkt_queue);
23204 - skb_queue_head_init(&sd->process_queue);
23205 + skb_queue_head_init_raw(&sd->input_pkt_queue);
23206 + skb_queue_head_init_raw(&sd->process_queue);
23207 + skb_queue_head_init_raw(&sd->tofree_queue);
23208 INIT_LIST_HEAD(&sd->poll_list);
23209 sd->output_queue_tailp = &sd->output_queue;
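The xmit_recursion changes here and in net/core/filter.c below replace the per-CPU recursion counter with xmit_rec_read() and matching increment/decrement wrappers (their definitions fall outside the lines shown). On RT the transmit path is preemptible and the task may migrate between CPUs mid-transmit, so a depth counter tied to the CPU can be read or modified on the wrong CPU; the depth has to travel with the task instead. In userspace, thread-local storage expresses the same idea:

    /* Recursion depth kept per task (thread-local) rather than per CPU;
     * names are illustrative. Build: gcc -o recur recur.c */
    #include <stdio.h>

    #define XMIT_RECURSION_LIMIT 10

    static _Thread_local int xmit_recursion;

    static int xmit_demo(int depth)
    {
        int ret;

        if (xmit_recursion > XMIT_RECURSION_LIMIT) {
            fprintf(stderr, "recursion limit hit at depth %d\n", depth);
            return -1;
        }
        xmit_recursion++;
        /* a misconfigured stack re-enters the xmit path, e.g. a device
         * looped back onto itself; recurse to model that */
        ret = xmit_demo(depth + 1);
        xmit_recursion--;
        return ret;
    }

    int main(void)
    {
        xmit_demo(0);
        return 0;
    }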
23211 diff -Nur linux-4.9.6.orig/net/core/filter.c linux-4.9.6/net/core/filter.c
23212 --- linux-4.9.6.orig/net/core/filter.c 2017-01-26 08:25:24.000000000 +0100
23213 +++ linux-4.9.6/net/core/filter.c 2017-01-28 13:59:10.103662798 +0100
23214 @@ -1645,7 +1645,7 @@
23218 - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
23219 + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
23220 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
23223 @@ -1653,9 +1653,9 @@
23227 - __this_cpu_inc(xmit_recursion);
23229 ret = dev_queue_xmit(skb);
23230 - __this_cpu_dec(xmit_recursion);
23235 diff -Nur linux-4.9.6.orig/net/core/gen_estimator.c linux-4.9.6/net/core/gen_estimator.c
23236 --- linux-4.9.6.orig/net/core/gen_estimator.c 2017-01-26 08:25:24.000000000 +0100
23237 +++ linux-4.9.6/net/core/gen_estimator.c 2017-01-28 13:59:10.103662798 +0100
23239 struct gnet_stats_basic_packed *bstats;
23240 struct gnet_stats_rate_est64 *rate_est;
23241 spinlock_t *stats_lock;
23242 - seqcount_t *running;
23243 + net_seqlock_t *running;
23246 unsigned long avpps;
23247 @@ -213,7 +213,7 @@
23248 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23249 struct gnet_stats_rate_est64 *rate_est,
23250 spinlock_t *stats_lock,
23251 - seqcount_t *running,
23252 + net_seqlock_t *running,
23253 struct nlattr *opt)
23255 struct gen_estimator *est;
23256 @@ -309,7 +309,7 @@
23257 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
23258 struct gnet_stats_rate_est64 *rate_est,
23259 spinlock_t *stats_lock,
23260 - seqcount_t *running, struct nlattr *opt)
23261 + net_seqlock_t *running, struct nlattr *opt)
23263 gen_kill_estimator(bstats, rate_est);
23264 return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
23265 diff -Nur linux-4.9.6.orig/net/core/gen_stats.c linux-4.9.6/net/core/gen_stats.c
23266 --- linux-4.9.6.orig/net/core/gen_stats.c 2017-01-26 08:25:24.000000000 +0100
23267 +++ linux-4.9.6/net/core/gen_stats.c 2017-01-28 13:59:10.107662951 +0100
23268 @@ -130,7 +130,7 @@
23272 -__gnet_stats_copy_basic(const seqcount_t *running,
23273 +__gnet_stats_copy_basic(net_seqlock_t *running,
23274 struct gnet_stats_basic_packed *bstats,
23275 struct gnet_stats_basic_cpu __percpu *cpu,
23276 struct gnet_stats_basic_packed *b)
23277 @@ -143,10 +143,10 @@
23281 - seq = read_seqcount_begin(running);
23282 + seq = net_seq_begin(running);
23283 bstats->bytes = b->bytes;
23284 bstats->packets = b->packets;
23285 - } while (running && read_seqcount_retry(running, seq));
23286 + } while (running && net_seq_retry(running, seq));
23288 EXPORT_SYMBOL(__gnet_stats_copy_basic);
23290 @@ -164,7 +164,7 @@
23291 * if the room in the socket buffer was not sufficient.
23294 -gnet_stats_copy_basic(const seqcount_t *running,
23295 +gnet_stats_copy_basic(net_seqlock_t *running,
23296 struct gnet_dump *d,
23297 struct gnet_stats_basic_cpu __percpu *cpu,
23298 struct gnet_stats_basic_packed *b)
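The estimator and stats changes above, together with the sch_api.c and sch_generic.c hunks further down, replace the qdisc's seqcount_t with net_seqlock_t. A bare seqcount assumes the writer cannot be preempted mid-update, since readers simply retry while the sequence is odd; on RT the writer can be preempted, so the count is wrapped in a seqlock whose write side is serialized by a proper lock that RT can apply priority inheritance to. The reader/writer protocol itself, in deliberately simplified C11 (not the kernel's implementation):

    /* Simplified seqcount: the writer bumps the sequence around its
     * update, the reader retries on an odd or changed sequence.
     * Build: gcc -pthread -o seq seq.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned int seq;
    static unsigned long bytes, packets;
    static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

    static void stats_update(unsigned long b)
    {
        pthread_mutex_lock(&write_lock); /* the "lock" half of net_seqlock_t */
        atomic_fetch_add(&seq, 1);       /* odd: update in progress */
        bytes += b;
        packets += 1;
        atomic_fetch_add(&seq, 1);       /* even again: consistent */
        pthread_mutex_unlock(&write_lock);
    }

    static void stats_read(unsigned long *b, unsigned long *p)
    {
        for (;;) {
            unsigned int s = atomic_load(&seq);

            if (s & 1)
                continue;               /* writer active, retry */
            *b = bytes;
            *p = packets;
            if (atomic_load(&seq) == s)
                return;                 /* no writer raced us */
        }
    }

    int main(void)
    {
        unsigned long b, p;

        stats_update(1500);
        stats_update(60);
        stats_read(&b, &p);
        printf("%lu bytes / %lu packets\n", b, p);
        return 0;
    }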
23299 diff -Nur linux-4.9.6.orig/net/core/skbuff.c linux-4.9.6/net/core/skbuff.c
23300 --- linux-4.9.6.orig/net/core/skbuff.c 2017-01-26 08:25:24.000000000 +0100
23301 +++ linux-4.9.6/net/core/skbuff.c 2017-01-28 13:59:10.107662951 +0100
23303 #include <linux/errqueue.h>
23304 #include <linux/prefetch.h>
23305 #include <linux/if_vlan.h>
23306 +#include <linux/locallock.h>
23308 #include <net/protocol.h>
23309 #include <net/dst.h>
23310 @@ -360,6 +361,8 @@
23312 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
23313 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
23314 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
23315 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
23317 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23319 @@ -367,10 +370,10 @@
23320 unsigned long flags;
23323 - local_irq_save(flags);
23324 + local_lock_irqsave(netdev_alloc_lock, flags);
23325 nc = this_cpu_ptr(&netdev_alloc_cache);
23326 data = __alloc_page_frag(nc, fragsz, gfp_mask);
23327 - local_irq_restore(flags);
23328 + local_unlock_irqrestore(netdev_alloc_lock, flags);
23332 @@ -389,9 +392,13 @@
23334 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
23336 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23337 + struct napi_alloc_cache *nc;
23340 - return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23341 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23342 + data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
23343 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23347 void *napi_alloc_frag(unsigned int fragsz)
23348 @@ -438,13 +445,13 @@
23349 if (sk_memalloc_socks())
23350 gfp_mask |= __GFP_MEMALLOC;
23352 - local_irq_save(flags);
23353 + local_lock_irqsave(netdev_alloc_lock, flags);
23355 nc = this_cpu_ptr(&netdev_alloc_cache);
23356 data = __alloc_page_frag(nc, len, gfp_mask);
23357 pfmemalloc = nc->pfmemalloc;
23359 - local_irq_restore(flags);
23360 + local_unlock_irqrestore(netdev_alloc_lock, flags);
23362 if (unlikely(!data))
23364 @@ -485,9 +492,10 @@
23365 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
23368 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23369 + struct napi_alloc_cache *nc;
23370 struct sk_buff *skb;
23374 len += NET_SKB_PAD + NET_IP_ALIGN;
23376 @@ -505,7 +513,10 @@
23377 if (sk_memalloc_socks())
23378 gfp_mask |= __GFP_MEMALLOC;
23380 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23381 data = __alloc_page_frag(&nc->page, len, gfp_mask);
23382 + pfmemalloc = nc->page.pfmemalloc;
23383 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23384 if (unlikely(!data))
23387 @@ -516,7 +527,7 @@
23390 /* use OR instead of assignment to avoid clearing of bits in mask */
23391 - if (nc->page.pfmemalloc)
23393 skb->pfmemalloc = 1;
23394 skb->head_frag = 1;
23396 @@ -760,23 +771,26 @@
23398 void __kfree_skb_flush(void)
23400 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23401 + struct napi_alloc_cache *nc;
23403 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23404 /* flush skb_cache if containing objects */
23405 if (nc->skb_count) {
23406 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
23410 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23413 static inline void _kfree_skb_defer(struct sk_buff *skb)
23415 - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
23416 + struct napi_alloc_cache *nc;
23418 /* drop skb->head and call any destructors for packet */
23419 skb_release_all(skb);
23421 + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23422 /* record skb to CPU local list */
23423 nc->skb_cache[nc->skb_count++] = skb;
23425 @@ -791,6 +805,7 @@
23429 + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
23431 void __kfree_skb_defer(struct sk_buff *skb)
23433 diff -Nur linux-4.9.6.orig/net/core/sock.c linux-4.9.6/net/core/sock.c
23434 --- linux-4.9.6.orig/net/core/sock.c 2017-01-26 08:25:24.000000000 +0100
23435 +++ linux-4.9.6/net/core/sock.c 2017-01-28 13:59:10.107662951 +0100
23436 @@ -2488,12 +2488,11 @@
23437 if (sk->sk_lock.owned)
23439 sk->sk_lock.owned = 1;
23440 - spin_unlock(&sk->sk_lock.slock);
23441 + spin_unlock_bh(&sk->sk_lock.slock);
23443 * The sk_lock has mutex_lock() semantics here:
23445 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
23446 - local_bh_enable();
23448 EXPORT_SYMBOL(lock_sock_nested);
23450 diff -Nur linux-4.9.6.orig/net/ipv4/icmp.c linux-4.9.6/net/ipv4/icmp.c
23451 --- linux-4.9.6.orig/net/ipv4/icmp.c 2017-01-26 08:25:24.000000000 +0100
23452 +++ linux-4.9.6/net/ipv4/icmp.c 2017-01-28 13:59:10.107662951 +0100
23454 #include <linux/jiffies.h>
23455 #include <linux/kernel.h>
23456 #include <linux/fcntl.h>
23457 +#include <linux/sysrq.h>
23458 #include <linux/socket.h>
23459 #include <linux/in.h>
23460 #include <linux/inet.h>
23462 #include <linux/string.h>
23463 #include <linux/netfilter_ipv4.h>
23464 #include <linux/slab.h>
23465 +#include <linux/locallock.h>
23466 #include <net/snmp.h>
23467 #include <net/ip.h>
23468 #include <net/route.h>
23469 @@ -204,6 +206,8 @@
23471 * On SMP we have one ICMP socket per-cpu.
23473 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
23475 static struct sock *icmp_sk(struct net *net)
23477 return *this_cpu_ptr(net->ipv4.icmp_sk);
23478 @@ -215,12 +219,14 @@
23480 local_bh_disable();
23482 + local_lock(icmp_sk_lock);
23485 if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
23486 /* This can happen if the output path signals a
23487 * dst_link_failure() for an outgoing ICMP packet.
23489 + local_unlock(icmp_sk_lock);
23493 @@ -230,6 +236,7 @@
23494 static inline void icmp_xmit_unlock(struct sock *sk)
23496 spin_unlock_bh(&sk->sk_lock.slock);
23497 + local_unlock(icmp_sk_lock);
23500 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
23501 @@ -358,6 +365,7 @@
23503 struct sk_buff *skb;
23505 + local_lock(icmp_sk_lock);
23506 sk = icmp_sk(dev_net((*rt)->dst.dev));
23507 if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
23508 icmp_param->data_len+icmp_param->head_len,
23509 @@ -380,6 +388,7 @@
23510 skb->ip_summed = CHECKSUM_NONE;
23511 ip_push_pending_frames(sk, fl4);
23513 + local_unlock(icmp_sk_lock);
23517 @@ -891,6 +900,30 @@
23521 + * 32bit and 64bit have different timestamp lengths, so we check for
23522 + * the cookie at offset 20 and verify it is repeated at offset 50
23524 +#define CO_POS0 20
23525 +#define CO_POS1 50
23526 +#define CO_SIZE sizeof(int)
23527 +#define ICMP_SYSRQ_SIZE 57
22530 + * We got an ICMP_SYSRQ_SIZE-sized ping request. Check for the cookie
22531 + * pattern and, if it matches, send the next byte to sysrq as a trigger.
23533 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
23535 + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
23536 + char *p = skb->data;
23538 + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
23539 + !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
23540 + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
23541 + handle_sysrq(p[CO_POS0 + CO_SIZE]);
23545 * Handle ICMP_ECHO ("ping") requests.
23547 * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
23548 @@ -917,6 +950,11 @@
23549 icmp_param.data_len = skb->len;
23550 icmp_param.head_len = sizeof(struct icmphdr);
23551 icmp_reply(&icmp_param, skb);
23553 + if (skb->len == ICMP_SYSRQ_SIZE &&
23554 + net->ipv4.sysctl_icmp_echo_sysrq) {
23555 + icmp_check_sysrq(net, skb);
23558 /* should there be an ICMP stat for ignored echos? */
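icmp_check_sysrq() above accepts a sysrq trigger only from an ICMP_SYSRQ_SIZE (57) byte echo payload carrying the icmp_echo_sysrq cookie, in network byte order, at offsets 20 and 50, with the byte following each copy naming the sysrq command (both bytes must match). A small userspace sketch that builds such a payload and replays the same check, reusing the patch's constants; actually delivering it would additionally need a raw ICMP socket or a suitably patterned ping, which is not shown:

    /* Builds the 57-byte payload that icmp_check_sysrq() above matches
     * and replays the same check on it; the constants mirror the patch.
     * Build: gcc -o sysrqping sysrqping.c */
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>

    #define CO_POS0         20
    #define CO_POS1         50
    #define CO_SIZE         sizeof(int)
    #define ICMP_SYSRQ_SIZE 57

    static int matches(const char *p, int cookie)
    {
        return !memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
               !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
               p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE];
    }

    int main(void)
    {
        char payload[ICMP_SYSRQ_SIZE];
        int cookie = htonl(0x01020304); /* must equal the icmp_echo_sysrq sysctl */

        memset(payload, 0, sizeof(payload));
        memcpy(payload + CO_POS0, &cookie, CO_SIZE);
        memcpy(payload + CO_POS1, &cookie, CO_SIZE);
        payload[CO_POS0 + CO_SIZE] = 't';   /* sysrq-t: dump task states */
        payload[CO_POS1 + CO_SIZE] = 't';   /* must match the first copy */

        printf("payload %s\n", matches(payload, cookie) ? "accepted" : "rejected");
        return 0;
    }

Placing the cookie twice, at both 20 and 50, is what makes the check robust against the differing 32-bit and 64-bit timestamp lengths mentioned in the comment above.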
23560 diff -Nur linux-4.9.6.orig/net/ipv4/sysctl_net_ipv4.c linux-4.9.6/net/ipv4/sysctl_net_ipv4.c
23561 --- linux-4.9.6.orig/net/ipv4/sysctl_net_ipv4.c 2017-01-26 08:25:24.000000000 +0100
23562 +++ linux-4.9.6/net/ipv4/sysctl_net_ipv4.c 2017-01-28 13:59:10.107662951 +0100
23563 @@ -681,6 +681,13 @@
23564 .proc_handler = proc_dointvec
23567 + .procname = "icmp_echo_sysrq",
23568 + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq,
23569 + .maxlen = sizeof(int),
23571 + .proc_handler = proc_dointvec
23574 .procname = "icmp_ignore_bogus_error_responses",
23575 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
23576 .maxlen = sizeof(int),
23577 diff -Nur linux-4.9.6.orig/net/ipv4/tcp_ipv4.c linux-4.9.6/net/ipv4/tcp_ipv4.c
23578 --- linux-4.9.6.orig/net/ipv4/tcp_ipv4.c 2017-01-26 08:25:24.000000000 +0100
23579 +++ linux-4.9.6/net/ipv4/tcp_ipv4.c 2017-01-28 13:59:10.107662951 +0100
23581 #include <linux/init.h>
23582 #include <linux/times.h>
23583 #include <linux/slab.h>
23584 +#include <linux/locallock.h>
23586 #include <net/net_namespace.h>
23587 #include <net/icmp.h>
23588 @@ -564,6 +565,7 @@
23590 EXPORT_SYMBOL(tcp_v4_send_check);
23592 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
23594 * This routine will send an RST to the other tcp.
23596 @@ -691,6 +693,8 @@
23597 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
23599 arg.tos = ip_hdr(skb)->tos;
23601 + local_lock(tcp_sk_lock);
23602 local_bh_disable();
23603 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
23604 skb, &TCP_SKB_CB(skb)->header.h4.opt,
23605 @@ -700,6 +704,7 @@
23606 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
23607 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
23609 + local_unlock(tcp_sk_lock);
23611 #ifdef CONFIG_TCP_MD5SIG
23613 @@ -775,6 +780,7 @@
23615 arg.bound_dev_if = oif;
23617 + local_lock(tcp_sk_lock);
23618 local_bh_disable();
23619 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
23620 skb, &TCP_SKB_CB(skb)->header.h4.opt,
23621 @@ -783,6 +789,7 @@
23623 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
23625 + local_unlock(tcp_sk_lock);
23628 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
23629 diff -Nur linux-4.9.6.orig/net/mac80211/rx.c linux-4.9.6/net/mac80211/rx.c
23630 --- linux-4.9.6.orig/net/mac80211/rx.c 2017-01-26 08:25:24.000000000 +0100
23631 +++ linux-4.9.6/net/mac80211/rx.c 2017-01-28 13:59:10.107662951 +0100
23632 @@ -4166,7 +4166,7 @@
23633 struct ieee80211_supported_band *sband;
23634 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
23636 - WARN_ON_ONCE(softirq_count() == 0);
23637 + WARN_ON_ONCE_NONRT(softirq_count() == 0);
23639 if (WARN_ON(status->band >= NUM_NL80211_BANDS))
23641 diff -Nur linux-4.9.6.orig/net/netfilter/core.c linux-4.9.6/net/netfilter/core.c
23642 --- linux-4.9.6.orig/net/netfilter/core.c 2017-01-26 08:25:24.000000000 +0100
23643 +++ linux-4.9.6/net/netfilter/core.c 2017-01-28 13:59:10.107662951 +0100
23644 @@ -22,12 +22,18 @@
23645 #include <linux/proc_fs.h>
23646 #include <linux/mutex.h>
23647 #include <linux/slab.h>
23648 +#include <linux/locallock.h>
23649 #include <linux/rcupdate.h>
23650 #include <net/net_namespace.h>
23651 #include <net/sock.h>
23653 #include "nf_internals.h"
23655 +#ifdef CONFIG_PREEMPT_RT_BASE
23656 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
23657 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
23660 static DEFINE_MUTEX(afinfo_mutex);
23662 const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
23663 diff -Nur linux-4.9.6.orig/net/packet/af_packet.c linux-4.9.6/net/packet/af_packet.c
23664 --- linux-4.9.6.orig/net/packet/af_packet.c 2017-01-26 08:25:24.000000000 +0100
23665 +++ linux-4.9.6/net/packet/af_packet.c 2017-01-28 13:59:10.111663104 +0100
23667 #include <linux/if_packet.h>
23668 #include <linux/wireless.h>
23669 #include <linux/kernel.h>
23670 +#include <linux/delay.h>
23671 #include <linux/kmod.h>
23672 #include <linux/slab.h>
23673 #include <linux/vmalloc.h>
23674 @@ -694,7 +695,7 @@
23675 if (BLOCK_NUM_PKTS(pbd)) {
23676 while (atomic_read(&pkc->blk_fill_in_prog)) {
23677 /* Waiting for skb_copy_bits to finish... */
23683 @@ -956,7 +957,7 @@
23684 if (!(status & TP_STATUS_BLK_TMO)) {
23685 while (atomic_read(&pkc->blk_fill_in_prog)) {
23686 /* Waiting for skb_copy_bits to finish... */
23691 prb_close_block(pkc, pbd, po, status);
23692 diff -Nur linux-4.9.6.orig/net/rds/ib_rdma.c linux-4.9.6/net/rds/ib_rdma.c
23693 --- linux-4.9.6.orig/net/rds/ib_rdma.c 2017-01-26 08:25:24.000000000 +0100
23694 +++ linux-4.9.6/net/rds/ib_rdma.c 2017-01-28 13:59:10.111663104 +0100
23696 #include <linux/slab.h>
23697 #include <linux/rculist.h>
23698 #include <linux/llist.h>
23699 +#include <linux/delay.h>
23701 #include "rds_single_path.h"
23703 @@ -210,7 +211,7 @@
23704 for_each_online_cpu(cpu) {
23705 flag = &per_cpu(clean_list_grace, cpu);
23706 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
23712 diff -Nur linux-4.9.6.orig/net/rxrpc/security.c linux-4.9.6/net/rxrpc/security.c
23713 --- linux-4.9.6.orig/net/rxrpc/security.c 2017-01-26 08:25:24.000000000 +0100
23714 +++ linux-4.9.6/net/rxrpc/security.c 2017-01-28 13:59:10.111663104 +0100
23716 #include <keys/rxrpc-type.h>
23717 #include "ar-internal.h"
23719 -static LIST_HEAD(rxrpc_security_methods);
23720 -static DECLARE_RWSEM(rxrpc_security_sem);
23722 static const struct rxrpc_security *rxrpc_security_types[] = {
23723 [RXRPC_SECURITY_NONE] = &rxrpc_no_security,
23724 #ifdef CONFIG_RXKAD
23725 diff -Nur linux-4.9.6.orig/net/sched/sch_api.c linux-4.9.6/net/sched/sch_api.c
23726 --- linux-4.9.6.orig/net/sched/sch_api.c 2017-01-26 08:25:24.000000000 +0100
23727 +++ linux-4.9.6/net/sched/sch_api.c 2017-01-28 13:59:10.111663104 +0100
23728 @@ -981,7 +981,7 @@
23729 rcu_assign_pointer(sch->stab, stab);
23731 if (tca[TCA_RATE]) {
23732 - seqcount_t *running;
23733 + net_seqlock_t *running;
23736 if (sch->flags & TCQ_F_MQROOT)
23737 diff -Nur linux-4.9.6.orig/net/sched/sch_generic.c linux-4.9.6/net/sched/sch_generic.c
23738 --- linux-4.9.6.orig/net/sched/sch_generic.c 2017-01-26 08:25:24.000000000 +0100
23739 +++ linux-4.9.6/net/sched/sch_generic.c 2017-01-28 13:59:10.111663104 +0100
23740 @@ -425,7 +425,11 @@
23741 .ops = &noop_qdisc_ops,
23742 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
23743 .dev_queue = &noop_netdev_queue,
23744 +#ifdef CONFIG_PREEMPT_RT_BASE
23745 + .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
23747 .running = SEQCNT_ZERO(noop_qdisc.running),
23749 .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
23751 EXPORT_SYMBOL(noop_qdisc);
23752 @@ -624,9 +628,17 @@
23753 lockdep_set_class(&sch->busylock,
23754 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
23756 +#ifdef CONFIG_PREEMPT_RT_BASE
23757 + seqlock_init(&sch->running);
23758 + lockdep_set_class(&sch->running.seqcount,
23759 + dev->qdisc_running_key ?: &qdisc_running_key);
23760 + lockdep_set_class(&sch->running.lock,
23761 + dev->qdisc_running_key ?: &qdisc_running_key);
23763 seqcount_init(&sch->running);
23764 lockdep_set_class(&sch->running,
23765 dev->qdisc_running_key ?: &qdisc_running_key);
23769 sch->enqueue = ops->enqueue;
23770 @@ -925,7 +937,7 @@
23771 /* Wait for outstanding qdisc_run calls. */
23772 list_for_each_entry(dev, head, close_list)
23773 while (some_qdisc_is_busy(dev))
23778 void dev_deactivate(struct net_device *dev)
23779 diff -Nur linux-4.9.6.orig/net/sunrpc/svc_xprt.c linux-4.9.6/net/sunrpc/svc_xprt.c
23780 --- linux-4.9.6.orig/net/sunrpc/svc_xprt.c 2017-01-26 08:25:24.000000000 +0100
23781 +++ linux-4.9.6/net/sunrpc/svc_xprt.c 2017-01-28 13:59:10.111663104 +0100
23782 @@ -396,7 +396,7 @@
23787 + cpu = get_cpu_light();
23788 pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
23790 atomic_long_inc(&pool->sp_stats.packets);
23791 @@ -432,7 +432,7 @@
23793 atomic_long_inc(&pool->sp_stats.threads_woken);
23794 wake_up_process(rqstp->rq_task);
23800 @@ -453,7 +453,7 @@
23807 trace_svc_xprt_do_enqueue(xprt, rqstp);
23809 diff -Nur linux-4.9.6.orig/scripts/mkcompile_h linux-4.9.6/scripts/mkcompile_h
23810 --- linux-4.9.6.orig/scripts/mkcompile_h 2017-01-26 08:25:24.000000000 +0100
23811 +++ linux-4.9.6/scripts/mkcompile_h 2017-01-28 13:59:10.111663104 +0100
23820 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
23824 if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
23825 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
23826 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
23827 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
23829 # Truncate to maximum length
23830 diff -Nur linux-4.9.6.orig/sound/core/pcm_native.c linux-4.9.6/sound/core/pcm_native.c
23831 --- linux-4.9.6.orig/sound/core/pcm_native.c 2017-01-26 08:25:24.000000000 +0100
23832 +++ linux-4.9.6/sound/core/pcm_native.c 2017-01-28 13:59:10.111663104 +0100
23833 @@ -135,7 +135,7 @@
23834 void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
23836 if (!substream->pcm->nonatomic)
23837 - local_irq_disable();
23838 + local_irq_disable_nort();
23839 snd_pcm_stream_lock(substream);
23841 EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
23842 @@ -150,7 +150,7 @@
23844 snd_pcm_stream_unlock(substream);
23845 if (!substream->pcm->nonatomic)
23846 - local_irq_enable();
23847 + local_irq_enable_nort();
23849 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
23851 @@ -158,7 +158,7 @@
23853 unsigned long flags = 0;
23854 if (!substream->pcm->nonatomic)
23855 - local_irq_save(flags);
23856 + local_irq_save_nort(flags);
23857 snd_pcm_stream_lock(substream);
23860 @@ -176,7 +176,7 @@
23862 snd_pcm_stream_unlock(substream);
23863 if (!substream->pcm->nonatomic)
23864 - local_irq_restore(flags);
23865 + local_irq_restore_nort(flags);
23867 EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);